From f63e48aab87eba4f37e1470b5463cb2a20061583 Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Sat, 28 Jun 2025 16:14:03 +0200 Subject: [PATCH 01/91] chore: LangChain based accuracy tests --- package-lock.json | 295 +++++++++++++++++- tests/accuracy/list-databases.test.ts | 26 ++ tests/accuracy/sdk/accuracy-scorers.ts | 125 ++++++++ tests/accuracy/sdk/describe-accuracy-tests.ts | 51 +++ tests/accuracy/sdk/models.ts | 62 ++++ tests/accuracy/sdk/test-tools.ts | 153 +++++++++ tests/accuracy/sdk/tool-calling-agent.ts | 36 +++ 7 files changed, 744 insertions(+), 4 deletions(-) create mode 100644 tests/accuracy/list-databases.test.ts create mode 100644 tests/accuracy/sdk/accuracy-scorers.ts create mode 100644 tests/accuracy/sdk/describe-accuracy-tests.ts create mode 100644 tests/accuracy/sdk/models.ts create mode 100644 tests/accuracy/sdk/test-tools.ts create mode 100644 tests/accuracy/sdk/tool-calling-agent.ts diff --git a/package-lock.json b/package-lock.json index a80dcb27..b998d85c 100644 --- a/package-lock.json +++ b/package-lock.json @@ -805,6 +805,13 @@ "node": ">=18" } }, + "node_modules/@cfworker/json-schema": { + "version": "4.1.1", + "resolved": "https://registry.npmjs.org/@cfworker/json-schema/-/json-schema-4.1.1.tgz", + "integrity": "sha512-gAmrUZSGtKc3AiBL71iNWxDsyUC5uMaKKGdvzYsBoTW/xi42JQHl7eKV2OYzCUqvc+D2RCcf7EXY2iCyFIk6og==", + "dev": true, + "license": "MIT" + }, "node_modules/@cspotcode/source-map-support": { "version": "0.8.1", "resolved": "https://registry.npmjs.org/@cspotcode/source-map-support/-/source-map-support-0.8.1.tgz", @@ -1533,6 +1540,16 @@ "dev": true, "license": "MIT" }, + "node_modules/@google/generative-ai": { + "version": "0.24.1", + "resolved": "https://registry.npmjs.org/@google/generative-ai/-/generative-ai-0.24.1.tgz", + "integrity": "sha512-MqO+MLfM6kjxcKoy0p1wRzG3b4ZZXtPI+z2IE26UogS2Cm/XHO+7gGRBh6gcJsOiIVoH93UwKvW4HdgiOZCy9Q==", + "dev": true, + "license": "Apache-2.0", + "engines": { + "node": ">=18.0.0" + } + }, 
"node_modules/@hapi/boom": { "version": "10.0.1", "resolved": "https://registry.npmjs.org/@hapi/boom/-/boom-10.0.1.tgz", @@ -1842,6 +1859,152 @@ "jsep": "^0.4.0||^1.0.0" } }, + "node_modules/@langchain/core": { + "version": "0.3.61", + "resolved": "https://registry.npmjs.org/@langchain/core/-/core-0.3.61.tgz", + "integrity": "sha512-4O7fw5SXNSE+uBnathLQrhm3t+7dZGagt/5kt37A+pXw0AkudxEBvveg73sSnpBd9SIz3/Vc7F4k8rCKXGbEDA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@cfworker/json-schema": "^4.0.2", + "ansi-styles": "^5.0.0", + "camelcase": "6", + "decamelize": "1.2.0", + "js-tiktoken": "^1.0.12", + "langsmith": "^0.3.33", + "mustache": "^4.2.0", + "p-queue": "^6.6.2", + "p-retry": "4", + "uuid": "^10.0.0", + "zod": "^3.25.32", + "zod-to-json-schema": "^3.22.3" + }, + "engines": { + "node": ">=18" + } + }, + "node_modules/@langchain/core/node_modules/ansi-styles": { + "version": "5.2.0", + "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-5.2.0.tgz", + "integrity": "sha512-Cxwpt2SfTzTtXcfOlzGEee8O+c+MmUgGrNiBcXnuWxuFJHe6a5Hz7qwhwe5OgaSYI0IJvkLqWX1ASG+cJOkEiA==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/chalk/ansi-styles?sponsor=1" + } + }, + "node_modules/@langchain/core/node_modules/camelcase": { + "version": "6.3.0", + "resolved": "https://registry.npmjs.org/camelcase/-/camelcase-6.3.0.tgz", + "integrity": "sha512-Gmy6FhYlCY7uOElZUSbxo2UCDH8owEk996gkbrpsgGtrJLM3J7jGxl9Ic7Qwwj4ivOE5AWZWRMecDdF7hqGjFA==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/@langchain/core/node_modules/uuid": { + "version": "10.0.0", + "resolved": "https://registry.npmjs.org/uuid/-/uuid-10.0.0.tgz", + "integrity": "sha512-8XkAphELsDnEGrDxUOHB3RGvXz6TeuYSGEZBOjtTtPm2lwhGBjLgOzLHB63IUWfBpNucQjND6d3AOudO+H3RWQ==", + "dev": true, + "funding": [ + 
"https://github.com/sponsors/broofa", + "https://github.com/sponsors/ctavan" + ], + "license": "MIT", + "bin": { + "uuid": "dist/bin/uuid" + } + }, + "node_modules/@langchain/google-genai": { + "version": "0.2.14", + "resolved": "https://registry.npmjs.org/@langchain/google-genai/-/google-genai-0.2.14.tgz", + "integrity": "sha512-gKe/T2LNh8wSSMJOaFmYd8cwQnDSXKtVtC6a7CFoq5nWuh0bKzhItM/7bue1aMN8mlKfB2G1HCwxhaZoSpS/DA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@google/generative-ai": "^0.24.0", + "uuid": "^11.1.0" + }, + "engines": { + "node": ">=18" + }, + "peerDependencies": { + "@langchain/core": ">=0.3.58 <0.4.0" + } + }, + "node_modules/@langchain/ollama": { + "version": "0.2.3", + "resolved": "https://registry.npmjs.org/@langchain/ollama/-/ollama-0.2.3.tgz", + "integrity": "sha512-1Obe45jgQspqLMBVlayQbGdywFmri8DgmGRdzNu0li56cG5RReYlRCFVDZBRMMvF9JhsP5eXRyfyivtKfITHWQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "ollama": "^0.5.12", + "uuid": "^10.0.0" + }, + "engines": { + "node": ">=18" + }, + "peerDependencies": { + "@langchain/core": ">=0.3.58 <0.4.0" + } + }, + "node_modules/@langchain/ollama/node_modules/uuid": { + "version": "10.0.0", + "resolved": "https://registry.npmjs.org/uuid/-/uuid-10.0.0.tgz", + "integrity": "sha512-8XkAphELsDnEGrDxUOHB3RGvXz6TeuYSGEZBOjtTtPm2lwhGBjLgOzLHB63IUWfBpNucQjND6d3AOudO+H3RWQ==", + "dev": true, + "funding": [ + "https://github.com/sponsors/broofa", + "https://github.com/sponsors/ctavan" + ], + "license": "MIT", + "bin": { + "uuid": "dist/bin/uuid" + } + }, + "node_modules/@langchain/openai": { + "version": "0.5.16", + "resolved": "https://registry.npmjs.org/@langchain/openai/-/openai-0.5.16.tgz", + "integrity": "sha512-TqzPE3PM0bMkQi53qs8vCFkwaEp3VgwGw+s1e8Nas5ICCZZtc2XqcDPz4hf2gpo1k7/AZd6HuPlAsDy6wye9Qw==", + "dev": true, + "license": "MIT", + "dependencies": { + "js-tiktoken": "^1.0.12", + "openai": "^5.3.0", + "zod": "^3.25.32" + }, + "engines": { + "node": ">=18" + }, + 
"peerDependencies": { + "@langchain/core": ">=0.3.58 <0.4.0" + } + }, + "node_modules/@langchain/textsplitters": { + "version": "0.1.0", + "resolved": "https://registry.npmjs.org/@langchain/textsplitters/-/textsplitters-0.1.0.tgz", + "integrity": "sha512-djI4uw9rlkAb5iMhtLED+xJebDdAG935AdP4eRTB02R7OB/act55Bj9wsskhZsvuyQRpO4O1wQOp85s6T6GWmw==", + "dev": true, + "license": "MIT", + "dependencies": { + "js-tiktoken": "^1.0.12" + }, + "engines": { + "node": ">=18" + }, + "peerDependencies": { + "@langchain/core": ">=0.2.21 <0.4.0" + } + }, "node_modules/@modelcontextprotocol/inspector": { "version": "0.16.1", "resolved": "https://registry.npmjs.org/@modelcontextprotocol/inspector/-/inspector-0.16.1.tgz", @@ -4762,6 +4925,19 @@ "node": ">=18.0.0" } }, + "node_modules/@smithy/middleware-retry/node_modules/uuid": { + "version": "9.0.1", + "resolved": "https://registry.npmjs.org/uuid/-/uuid-9.0.1.tgz", + "integrity": "sha512-b+1eJOlsR9K8HJpow9Ok3fiWOWSIcIzXodvv0rQjVoOVNpWMpxf1wZNpt4y9h10odCNrqnYp1OBzRktckBe3sA==", + "funding": [ + "https://github.com/sponsors/broofa", + "https://github.com/sponsors/ctavan" + ], + "license": "MIT", + "bin": { + "uuid": "dist/bin/uuid" + } + }, "node_modules/@smithy/middleware-serde": { "version": "4.0.3", "resolved": "https://registry.npmjs.org/@smithy/middleware-serde/-/middleware-serde-4.0.3.tgz", @@ -5227,6 +5403,13 @@ "undici-types": "~7.8.0" } }, + "node_modules/@types/retry": { + "version": "0.12.0", + "resolved": "https://registry.npmjs.org/@types/retry/-/retry-0.12.0.tgz", + "integrity": "sha512-wWKOClTTiizcZhXnPY4wikVAwmdYHp8q6DmC+EJUzAMsycb7HB32Kh9RN4+0gExjmPmZSAQjgURXIGATPegAvA==", + "dev": true, + "license": "MIT" + }, "node_modules/@types/simple-oauth2": { "version": "5.0.7", "resolved": "https://registry.npmjs.org/@types/simple-oauth2/-/simple-oauth2-5.0.7.tgz", @@ -5249,6 +5432,13 @@ "license": "MIT", "optional": true }, + "node_modules/@types/uuid": { + "version": "10.0.0", + "resolved": 
"https://registry.npmjs.org/@types/uuid/-/uuid-10.0.0.tgz", + "integrity": "sha512-7gqG38EyHgyP1S+7+xomFtL+ZNHcKv6DwNaCZmJmo1vgMugyF3TCnXVg4t1uk89mLNwnLtnY3TpOpCOyp1/xHQ==", + "dev": true, + "license": "MIT" + }, "node_modules/@types/webidl-conversions": { "version": "7.0.3", "resolved": "https://registry.npmjs.org/@types/webidl-conversions/-/webidl-conversions-7.0.3.tgz", @@ -6643,6 +6833,16 @@ "node": ">=12" } }, + "node_modules/console-table-printer": { + "version": "2.14.6", + "resolved": "https://registry.npmjs.org/console-table-printer/-/console-table-printer-2.14.6.tgz", + "integrity": "sha512-MCBl5HNVaFuuHW6FGbL/4fB7N/ormCy+tQ+sxTrF6QtSbSNETvPuOVbkJBhzDgYhvjWGrTma4eYJa37ZuoQsPw==", + "dev": true, + "license": "MIT", + "dependencies": { + "simple-wcswidth": "^1.0.1" + } + }, "node_modules/content-disposition": { "version": "1.0.0", "resolved": "https://registry.npmjs.org/content-disposition/-/content-disposition-1.0.0.tgz", @@ -6804,6 +7004,16 @@ } } }, + "node_modules/decamelize": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/decamelize/-/decamelize-1.2.0.tgz", + "integrity": "sha512-z2S+W9X73hAUUki+N+9Za2lBlun89zigOyGrsax+KUQ6wKW4ZoWpEYBkGhQjwAjjDCkWxhY0VKEhk8wzY7F5cA==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/decko": { "version": "1.2.0", "resolved": "https://registry.npmjs.org/decko/-/decko-1.2.0.tgz", @@ -8974,6 +9184,16 @@ "node": ">=0.10.0" } }, + "node_modules/js-tiktoken": { + "version": "1.0.20", + "resolved": "https://registry.npmjs.org/js-tiktoken/-/js-tiktoken-1.0.20.tgz", + "integrity": "sha512-Xlaqhhs8VfCd6Sh7a1cFkZHQbYTLCwVJJWiHVxBYzLPxW0XsoxBy1hitmjkdIjD3Aon5BXLHFwU5O8WUx6HH+A==", + "dev": true, + "license": "MIT", + "dependencies": { + "base64-js": "^1.5.1" + } + }, "node_modules/js-tokens": { "version": "4.0.0", "resolved": "https://registry.npmjs.org/js-tokens/-/js-tokens-4.0.0.tgz", @@ -9852,6 +10072,16 @@ "integrity": 
"sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==", "license": "MIT" }, + "node_modules/mustache": { + "version": "4.2.0", + "resolved": "https://registry.npmjs.org/mustache/-/mustache-4.2.0.tgz", + "integrity": "sha512-71ippSywq5Yb7/tVYyGbkBggbU8H3u5Rz56fH60jGFgr8uHwxs+aSKeqmluIVzM0m0kB7xQjKS6qPfd0b2ZoqQ==", + "dev": true, + "license": "MIT", + "bin": { + "mustache": "bin/mustache" + } + }, "node_modules/nan": { "version": "2.23.0", "resolved": "https://registry.npmjs.org/nan/-/nan-2.23.0.tgz", @@ -10177,6 +10407,16 @@ "node": "^10.13.0 || >=12.0.0" } }, + "node_modules/ollama": { + "version": "0.5.16", + "resolved": "https://registry.npmjs.org/ollama/-/ollama-0.5.16.tgz", + "integrity": "sha512-OEbxxOIUZtdZgOaTPAULo051F5y+Z1vosxEYOoABPnQKeW7i4O8tJNlxCB+xioyoorVqgjkdj+TA1f1Hy2ug/w==", + "dev": true, + "license": "MIT", + "dependencies": { + "whatwg-fetch": "^3.6.20" + } + }, "node_modules/on-finished": { "version": "2.4.1", "resolved": "https://registry.npmjs.org/on-finished/-/on-finished-2.4.1.tgz", @@ -10216,6 +10456,28 @@ "url": "https://github.com/sponsors/sindresorhus" } }, + "node_modules/openai": { + "version": "5.8.2", + "resolved": "https://registry.npmjs.org/openai/-/openai-5.8.2.tgz", + "integrity": "sha512-8C+nzoHYgyYOXhHGN6r0fcb4SznuEn1R7YZMvlqDbnCuE0FM2mm3T1HiYW6WIcMS/F1Of2up/cSPjLPaWt0X9Q==", + "dev": true, + "license": "Apache-2.0", + "bin": { + "openai": "bin/cli" + }, + "peerDependencies": { + "ws": "^8.18.0", + "zod": "^3.23.8" + }, + "peerDependenciesMeta": { + "ws": { + "optional": true + }, + "zod": { + "optional": true + } + } + }, "node_modules/openapi-fetch": { "version": "0.14.0", "resolved": "https://registry.npmjs.org/openapi-fetch/-/openapi-fetch-0.14.0.tgz", @@ -10403,6 +10665,16 @@ "dev": true, "license": "MIT" }, + "node_modules/p-finally": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/p-finally/-/p-finally-1.0.0.tgz", + "integrity": 
"sha512-LICb2p9CB7FS+0eR1oqWnHhp0FljGLZCWBE9aix0Uye9W8LTQPwMTYVGWQWIw9RdQiDg4+epXQODwIYJtSJaow==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=4" + } + }, "node_modules/p-limit": { "version": "3.1.0", "resolved": "https://registry.npmjs.org/p-limit/-/p-limit-3.1.0.tgz", @@ -11881,6 +12153,13 @@ "joi": "^17.6.4" } }, + "node_modules/simple-wcswidth": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/simple-wcswidth/-/simple-wcswidth-1.1.2.tgz", + "integrity": "sha512-j7piyCjAeTDSjzTSQ7DokZtMNwNlEAyxqSZeCS+CXH7fJ4jx3FuJ/mTW3mE+6JLs4VJBbcll0Kjn+KXI5t21Iw==", + "dev": true, + "license": "MIT" + }, "node_modules/simple-websocket": { "version": "9.1.0", "resolved": "https://registry.npmjs.org/simple-websocket/-/simple-websocket-9.1.0.tgz", @@ -13093,16 +13372,17 @@ } }, "node_modules/uuid": { - "version": "9.0.1", - "resolved": "https://registry.npmjs.org/uuid/-/uuid-9.0.1.tgz", - "integrity": "sha512-b+1eJOlsR9K8HJpow9Ok3fiWOWSIcIzXodvv0rQjVoOVNpWMpxf1wZNpt4y9h10odCNrqnYp1OBzRktckBe3sA==", + "version": "11.1.0", + "resolved": "https://registry.npmjs.org/uuid/-/uuid-11.1.0.tgz", + "integrity": "sha512-0/A9rDy9P7cJ+8w1c9WD9V//9Wj15Ce2MPz8Ri6032usz+NfePxx5AcN3bN+r6ZL6jEo066/yNYB3tn4pQEx+A==", + "dev": true, "funding": [ "https://github.com/sponsors/broofa", "https://github.com/sponsors/ctavan" ], "license": "MIT", "bin": { - "uuid": "dist/bin/uuid" + "uuid": "dist/esm/bin/uuid" } }, "node_modules/v8-compile-cache-lib": { @@ -13380,6 +13660,13 @@ "node": ">=12" } }, + "node_modules/whatwg-fetch": { + "version": "3.6.20", + "resolved": "https://registry.npmjs.org/whatwg-fetch/-/whatwg-fetch-3.6.20.tgz", + "integrity": "sha512-EqhiFU6daOA8kpjOWTL0olhVOF3i7OrFzSYiGsEMB8GcXS+RrzauAERX65xMeNWVqxA6HXH2m69Z9LaKKdisfg==", + "dev": true, + "license": "MIT" + }, "node_modules/whatwg-url": { "version": "14.2.0", "resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-14.2.0.tgz", diff --git a/tests/accuracy/list-databases.test.ts 
b/tests/accuracy/list-databases.test.ts new file mode 100644 index 00000000..ae3f6c7d --- /dev/null +++ b/tests/accuracy/list-databases.test.ts @@ -0,0 +1,26 @@ +import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; +import { getAvailableModels } from "./sdk/models.js"; + +describeAccuracyTests("list-databases", getAvailableModels(), [ + { + prompt: "Assume that you're already connected. How many collections are there in sample_mflix database", + mockedTools: { + "list-collections": function listCollections() { + return { + content: [ + { + type: "text", + text: "Name: coll1", + }, + ], + }; + }, + }, + expectedToolCalls: [ + { + toolName: "list-collections", + parameters: { database: "sample_mflix" }, + }, + ], + }, +]); diff --git a/tests/accuracy/sdk/accuracy-scorers.ts b/tests/accuracy/sdk/accuracy-scorers.ts new file mode 100644 index 00000000..bf92eead --- /dev/null +++ b/tests/accuracy/sdk/accuracy-scorers.ts @@ -0,0 +1,125 @@ +export type ToolCall = { + toolCallId: string; + toolName: string; + parameters: unknown; +}; +export type ExpectedToolCall = Omit; + +export function toolCallingAccuracyScorer(expectedToolCalls: ExpectedToolCall[], actualToolCalls: ToolCall[]): number { + if (actualToolCalls.length < expectedToolCalls.length) { + return 0; + } + + const possibleScore = actualToolCalls.length > expectedToolCalls.length ? 
0.75 : 1; + const checkedToolCallIds = new Set(); + for (const expectedToolCall of expectedToolCalls) { + const matchingActualToolCall = actualToolCalls.find( + (actualToolCall) => + actualToolCall.toolName === expectedToolCall.toolName && + !checkedToolCallIds.has(actualToolCall.toolCallId) + ); + + if (!matchingActualToolCall) { + return 0; + } + + checkedToolCallIds.add(matchingActualToolCall.toolCallId); + } + + return possibleScore; +} + +export function parameterMatchingAccuracyScorer( + expectedToolCalls: ExpectedToolCall[], + actualToolCalls: ToolCall[] +): number { + if (expectedToolCalls.length === 0) { + return 1; + } + + const toolCallScores: number[] = []; + const checkedToolCallIds = new Set(); + + for (const expectedToolCall of expectedToolCalls) { + const matchingActualToolCall = actualToolCalls.find( + (actualToolCall) => + actualToolCall.toolName === expectedToolCall.toolName && + !checkedToolCallIds.has(actualToolCall.toolCallId) + ); + + if (!matchingActualToolCall) { + toolCallScores.push(0); + continue; + } + + checkedToolCallIds.add(matchingActualToolCall.toolCallId); + const score = compareParams(expectedToolCall.parameters, matchingActualToolCall.parameters); + toolCallScores.push(score); + } + + const totalScore = toolCallScores.reduce((sum, score) => sum + score, 0); + return totalScore / toolCallScores.length; +} + +/** + * Recursively compares expected and actual parameters and returns a score. + * - 1: Perfect match. + * - 0.75: All expected parameters are present and match, but there are extra actual parameters. + * - 0: Missing parameters or mismatched values. + */ +function compareParams(expected: unknown, actual: unknown): number { + if (expected === null || expected === undefined) { + return actual === null || actual === undefined ? 
1 : 0; + } + if (actual === null || actual === undefined) { + return 0; + } + + if (Array.isArray(expected)) { + if (!Array.isArray(actual) || actual.length < expected.length) { + return 0; + } + let minScore = 1; + for (let i = 0; i < expected.length; i++) { + minScore = Math.min(minScore, compareParams(expected[i], actual[i])); + } + if (minScore === 0) { + return 0; + } + if (actual.length > expected.length) { + minScore = Math.min(minScore, 0.75); + } + return minScore; + } + + if (typeof expected === "object") { + if (typeof actual !== "object" || Array.isArray(actual)) { + return 0; + } + const expectedKeys = Object.keys(expected as Record); + const actualKeys = Object.keys(actual as Record); + + let minScore = 1; + for (const key of expectedKeys) { + if (!Object.prototype.hasOwnProperty.call(actual, key)) { + return 0; + } + minScore = Math.min( + minScore, + compareParams((expected as Record)[key], (actual as Record)[key]) + ); + } + + if (minScore === 0) { + return 0; + } + + if (actualKeys.length > expectedKeys.length) { + minScore = Math.min(minScore, 0.75); + } + return minScore; + } + + // eslint-disable-next-line eqeqeq + return expected == actual ? 
1 : 0; +} diff --git a/tests/accuracy/sdk/describe-accuracy-tests.ts b/tests/accuracy/sdk/describe-accuracy-tests.ts new file mode 100644 index 00000000..0ec4bb64 --- /dev/null +++ b/tests/accuracy/sdk/describe-accuracy-tests.ts @@ -0,0 +1,51 @@ +import { AgentExecutor } from "langchain/agents"; +import { Tool } from "@modelcontextprotocol/sdk/types.js"; +import { discoverMongoDBTools, TestTools, ToolResultGenerators } from "./test-tools.js"; +import { TestableModels } from "./models.js"; +import { getToolCallingAgent } from "./tool-calling-agent.js"; +import { ExpectedToolCall, parameterMatchingAccuracyScorer, toolCallingAccuracyScorer } from "./accuracy-scorers.js"; + +interface AccuracyTestConfig { + prompt: string; + expectedToolCalls: ExpectedToolCall[]; + mockedTools: ToolResultGenerators; +} + +export function describeAccuracyTests( + suiteName: string, + models: TestableModels, + accuracyTestConfigs: AccuracyTestConfig[] +) { + const eachModel = describe.each(models); + const eachTest = it.each(accuracyTestConfigs); + + eachModel(`$modelName - ${suiteName}`, function (model) { + let mcpTools: Tool[]; + let testTools: TestTools; + let agent: AgentExecutor; + + beforeAll(async () => { + mcpTools = await discoverMongoDBTools(); + }); + + beforeEach(() => { + testTools = new TestTools(mcpTools); + const transformToolResult = model.transformToolResult.bind(model); + agent = getToolCallingAgent(model, testTools.langChainTools(transformToolResult)); + }); + + eachTest("$prompt", async function (testConfig) { + testTools.mockTools(testConfig.mockedTools); + const conversation = await agent.invoke({ input: testConfig.prompt }); + console.log("conversation", conversation); + const toolCalls = testTools.getToolCalls(); + console.log("?????? toolCalls", toolCalls); + console.log("???? 
expected", testConfig.expectedToolCalls); + const toolCallingAccuracy = toolCallingAccuracyScorer(testConfig.expectedToolCalls, toolCalls); + const parameterMatchingAccuracy = parameterMatchingAccuracyScorer(testConfig.expectedToolCalls, toolCalls); + + expect(toolCallingAccuracy).not.toEqual(0); + expect(parameterMatchingAccuracy).toBeGreaterThanOrEqual(0.5); + }); + }); +} diff --git a/tests/accuracy/sdk/models.ts b/tests/accuracy/sdk/models.ts new file mode 100644 index 00000000..d370633f --- /dev/null +++ b/tests/accuracy/sdk/models.ts @@ -0,0 +1,62 @@ +import { BaseChatModel } from "@langchain/core/language_models/chat_models"; +import { ChatGoogleGenerativeAI } from "@langchain/google-genai"; +import { ChatOllama } from "@langchain/ollama"; +import { CallToolResult } from "@modelcontextprotocol/sdk/types.js"; + +type ToolResultForOllama = string; +export type AcceptableToolResponse = CallToolResult | ToolResultForOllama; + +export interface Model { + isAvailable(): boolean; + getLangChainModel(): M; + transformToolResult(callToolResult: CallToolResult): T; +} + +export class GeminiModel implements Model { + constructor(readonly modelName: string) {} + + isAvailable(): boolean { + return !!process.env.MDB_GEMINI_API_KEY; + } + + getLangChainModel(): ChatGoogleGenerativeAI { + return new ChatGoogleGenerativeAI({ + model: this.modelName, + apiKey: process.env.MDB_GEMINI_API_KEY, + }); + } + + transformToolResult(callToolResult: CallToolResult) { + return callToolResult; + } +} + +export class OllamaModel implements Model { + constructor(readonly modelName: string) {} + + isAvailable(): boolean { + return !!process.env.MDB_GEMINI_API_KEY; + } + + getLangChainModel(): ChatOllama { + return new ChatOllama({ + model: this.modelName, + }); + } + + transformToolResult(callToolResult: CallToolResult): ToolResultForOllama { + return JSON.stringify(callToolResult); + } +} + +const ALL_TESTABLE_MODELS = [ + // new GeminiModel("gemini-1.5-flash"), + // new 
GeminiModel("gemini-2.0-flash"), + new OllamaModel("qwen3:latest"), +]; + +export type TestableModels = ReturnType; + +export function getAvailableModels() { + return ALL_TESTABLE_MODELS.filter((model) => model.isAvailable()); +} diff --git a/tests/accuracy/sdk/test-tools.ts b/tests/accuracy/sdk/test-tools.ts new file mode 100644 index 00000000..82719454 --- /dev/null +++ b/tests/accuracy/sdk/test-tools.ts @@ -0,0 +1,153 @@ +import { jest } from "@jest/globals"; +import { v4 as uuid } from "uuid"; +import { DynamicTool, tool as langChainTool } from "@langchain/core/tools"; +import { Client } from "@modelcontextprotocol/sdk/client/index.js"; +import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js"; +import { CallToolResult, Tool } from "@modelcontextprotocol/sdk/types.js"; + +import { InMemoryTransport } from "../../integration/inMemoryTransport.js"; +import { defaultTestConfig } from "../../integration/helpers.js"; +import { Session } from "../../../src/session.js"; +import { Telemetry } from "../../../src/telemetry/telemetry.js"; +import { Server } from "../../../src/server.js"; +import { AcceptableToolResponse } from "./models.js"; +import { ToolCall } from "./accuracy-scorers.js"; + +type ToolResultGeneratorFn = (...parameters: unknown[]) => CallToolResult; +type MockedToolResultGeneratorFn = jest.MockedFunction; +type MockedTools = Record; +export type ToolResultGenerators = Record; +export type LangChainTool = DynamicTool; +export type ToolResultTransformer = (toolResult: CallToolResult) => T; + +export class TestTools { + private mockedTools: MockedTools = {}; + private recordedToolCalls: ToolCall[] = []; + + constructor(private readonly mcpTools: Tool[]) { + for (const mcpTool of mcpTools) { + this.mockedTools[mcpTool.name] = jest.fn().mockReturnValue({ + content: [ + { + type: "text", + text: `Mock implementation for tool - ${mcpTool.name} not present`, + }, + ], + isError: true, + }); + } + } + + getToolCalls() { + return 
this.recordedToolCalls; + } + + mockTools(toolResultGenerators: ToolResultGenerators) { + for (const toolName in toolResultGenerators) { + const toolResultGeneratorFn = toolResultGenerators[toolName]; + if (!this.mockedTools[toolName]) { + throw new Error(`Attempted to mock unrecognized tool - ${toolName}`); + } + + if (!toolResultGeneratorFn) { + // Are you happy TS? + continue; + } + this.mockedTools[toolName] = jest.fn(toolResultGeneratorFn); + } + } + + langChainTools( + transformToolResult: ToolResultTransformer + ): LangChainTool[] { + return this.mcpTools.map((mcpTool) => { + return langChainTool((...args) => { + console.log("????? args", args); + const [parameters, { runName, runId }] = args; + const toolCallId = typeof runId !== "undefined" ? `${runId}` : uuid(); + return this.langChainToolResultGenerator(`${runName}`, parameters, toolCallId, transformToolResult); + }, mcpTool); + }); + } + + private langChainToolResultGenerator( + tool: string, + parameters: unknown, + toolCallId: string, + transformToolResult: ToolResultTransformer + ): T { + this.recordedToolCalls.push({ + toolCallId: toolCallId, + toolName: tool, + parameters, + }); + const mockedToolResultGenerator = this.mockedTools[tool]; + if (!mockedToolResultGenerator) { + // log as well + return transformToolResult({ + content: [ + { + type: "text", + text: `Could not resolve tool generator for ${tool}`, + }, + ], + isError: true, + }); + } + + return transformToolResult(mockedToolResultGenerator(parameters)); + } +} + +export async function discoverMongoDBTools(): Promise { + let mcpClient: Client | undefined; + let mcpServer: Server | undefined; + try { + const serverTransport = new InMemoryTransport(); + const clientTransport = new InMemoryTransport(); + + await serverTransport.start(); + await clientTransport.start(); + + void serverTransport.output.pipeTo(clientTransport.input); + void clientTransport.output.pipeTo(serverTransport.input); + + const session = new Session({ + apiBaseUrl: 
defaultTestConfig.apiBaseUrl, + }); + + const telemetry = Telemetry.create(session, defaultTestConfig); + + mcpClient = new Client( + { + name: "tool-discovery-client", + version: "0.0.0", + }, + { + capabilities: {}, + } + ); + + mcpServer = new Server({ + session, + userConfig: defaultTestConfig, + telemetry, + mcpServer: new McpServer({ + name: "test-server", + version: "5.2.3", + }), + }); + + await mcpServer.connect(serverTransport); + await mcpClient.connect(clientTransport); + + return (await mcpClient.listTools()).tools; + } catch (error: unknown) { + console.error("Unexpected error occured", error); + return []; + } finally { + await mcpClient?.close(); + await mcpServer?.session?.close(); + await mcpServer?.close(); + } +} diff --git a/tests/accuracy/sdk/tool-calling-agent.ts b/tests/accuracy/sdk/tool-calling-agent.ts new file mode 100644 index 00000000..b9adedf5 --- /dev/null +++ b/tests/accuracy/sdk/tool-calling-agent.ts @@ -0,0 +1,36 @@ +import { ChatPromptTemplate } from "@langchain/core/prompts"; +import { createToolCallingAgent, AgentExecutor } from "langchain/agents"; + +import { LangChainTool } from "./test-tools.js"; +import { AcceptableToolResponse, Model } from "./models.js"; +import { BaseChatModel } from "@langchain/core/language_models/chat_models"; + +const prompt = ChatPromptTemplate.fromMessages([ + [ + "system", + [ + 'The keywords "MUST", "MUST NOT", "SHALL", "SHALL NOT", "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this document are to be interpreted as described in RFC 2119 (https://www.ietf.org/rfc/rfc2119.txt)', + "You are an expect AI assistant with access to a set of tools for MongoDB database operations.", + "You MUST use the most relevant tool to answer the user's request", + "When calling a tool, you MUST strictly follow its input schema and MUST provide all required arguments", + "If a task requires multiple steps, you MUST call the necessary tools in sequence", + 'If you do not know the answer or the 
request cannot be fulfilled, you MUST reply with "I don\'t know"', + ].join("\n"), + ], + ["human", "{input}"], + ["placeholder", "{agent_scratchpad}"], +]); + +export function getToolCallingAgent( + model: Model, + tools: LangChainTool[] +) { + const llm = model.getLangChainModel(); + const agent = createToolCallingAgent({ + llm, + tools, + prompt, + }); + const agentExecutor = new AgentExecutor({ agent, tools }); + return agentExecutor; +} From 7efe7be64fd6f8a33ab577c30737f87d497b2ae5 Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Mon, 30 Jun 2025 11:06:38 +0200 Subject: [PATCH 02/91] chore: use vercel AI SDK instead of langchain LangChain's ToolCalling agent was not providing a structured tool call response and different model providers were providing entirely different tool calls for the same tool definition which was too turbulent for us to have any accuracy baseline at all. Vercel's AI SDK pushes us forward on that problem and the tool call responses so far have always been well structured. This commit replaces LangChain based implementation with Vercel's AI SDK based implementation. 
--- package-lock.json | 512 +++++++++--------- tests/accuracy/sdk/agent.ts | 38 ++ tests/accuracy/sdk/describe-accuracy-tests.ts | 14 +- tests/accuracy/sdk/models.ts | 46 +- tests/accuracy/sdk/test-tools.ts | 106 ++-- tests/accuracy/sdk/tool-calling-agent.ts | 36 -- 6 files changed, 350 insertions(+), 402 deletions(-) create mode 100644 tests/accuracy/sdk/agent.ts delete mode 100644 tests/accuracy/sdk/tool-calling-agent.ts diff --git a/package-lock.json b/package-lock.json index b998d85c..497cf564 100644 --- a/package-lock.json +++ b/package-lock.json @@ -56,6 +56,83 @@ "node": "^20.19.0 || ^22.12.0 || >= 23.0.0" } }, + "@himanshusinghs/ai-sdk-google": { + "extraneous": true + }, + "node_modules/@ai-sdk/provider": { + "version": "1.1.3", + "resolved": "https://registry.npmjs.org/@ai-sdk/provider/-/provider-1.1.3.tgz", + "integrity": "sha512-qZMxYJ0qqX/RfnuIaab+zp8UAeJn/ygXXAffR5I4N0n1IrvA6qBsjc8hXLmBiMV2zoXlifkacF7sEFnYnjBcqg==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "json-schema": "^0.4.0" + }, + "engines": { + "node": ">=18" + } + }, + "node_modules/@ai-sdk/provider-utils": { + "version": "2.2.8", + "resolved": "https://registry.npmjs.org/@ai-sdk/provider-utils/-/provider-utils-2.2.8.tgz", + "integrity": "sha512-fqhG+4sCVv8x7nFzYnFo19ryhAa3w096Kmc3hWxMQfW/TubPOmt3A6tYZhl4mUfQWWQMsuSkLrtjlWuXBVSGQA==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "@ai-sdk/provider": "1.1.3", + "nanoid": "^3.3.8", + "secure-json-parse": "^2.7.0" + }, + "engines": { + "node": ">=18" + }, + "peerDependencies": { + "zod": "^3.23.8" + } + }, + "node_modules/@ai-sdk/react": { + "version": "1.2.12", + "resolved": "https://registry.npmjs.org/@ai-sdk/react/-/react-1.2.12.tgz", + "integrity": "sha512-jK1IZZ22evPZoQW3vlkZ7wvjYGYF+tRBKXtrcolduIkQ/m/sOAVcVeVDUDvh1T91xCnWCdUGCPZg2avZ90mv3g==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "@ai-sdk/provider-utils": "2.2.8", + "@ai-sdk/ui-utils": "1.2.11", + "swr": "^2.2.5", + 
"throttleit": "2.1.0" + }, + "engines": { + "node": ">=18" + }, + "peerDependencies": { + "react": "^18 || ^19 || ^19.0.0-rc", + "zod": "^3.23.8" + }, + "peerDependenciesMeta": { + "zod": { + "optional": true + } + } + }, + "node_modules/@ai-sdk/ui-utils": { + "version": "1.2.11", + "resolved": "https://registry.npmjs.org/@ai-sdk/ui-utils/-/ui-utils-1.2.11.tgz", + "integrity": "sha512-3zcwCc8ezzFlwp3ZD15wAPjf2Au4s3vAbKsXQVyhxODHcmu0iyPO2Eua6D/vicq/AUm/BAo60r97O6HU+EI0+w==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "@ai-sdk/provider": "1.1.3", + "@ai-sdk/provider-utils": "2.2.8", + "zod-to-json-schema": "^3.24.1" + }, + "engines": { + "node": ">=18" + }, + "peerDependencies": { + "zod": "^3.23.8" + } + }, "node_modules/@ampproject/remapping": { "version": "2.3.0", "resolved": "https://registry.npmjs.org/@ampproject/remapping/-/remapping-2.3.0.tgz", @@ -805,13 +882,6 @@ "node": ">=18" } }, - "node_modules/@cfworker/json-schema": { - "version": "4.1.1", - "resolved": "https://registry.npmjs.org/@cfworker/json-schema/-/json-schema-4.1.1.tgz", - "integrity": "sha512-gAmrUZSGtKc3AiBL71iNWxDsyUC5uMaKKGdvzYsBoTW/xi42JQHl7eKV2OYzCUqvc+D2RCcf7EXY2iCyFIk6og==", - "dev": true, - "license": "MIT" - }, "node_modules/@cspotcode/source-map-support": { "version": "0.8.1", "resolved": "https://registry.npmjs.org/@cspotcode/source-map-support/-/source-map-support-0.8.1.tgz", @@ -1540,16 +1610,6 @@ "dev": true, "license": "MIT" }, - "node_modules/@google/generative-ai": { - "version": "0.24.1", - "resolved": "https://registry.npmjs.org/@google/generative-ai/-/generative-ai-0.24.1.tgz", - "integrity": "sha512-MqO+MLfM6kjxcKoy0p1wRzG3b4ZZXtPI+z2IE26UogS2Cm/XHO+7gGRBh6gcJsOiIVoH93UwKvW4HdgiOZCy9Q==", - "dev": true, - "license": "Apache-2.0", - "engines": { - "node": ">=18.0.0" - } - }, "node_modules/@hapi/boom": { "version": "10.0.1", "resolved": "https://registry.npmjs.org/@hapi/boom/-/boom-10.0.1.tgz", @@ -1597,6 +1657,54 @@ "@hapi/hoek": "^11.0.2" } }, + 
"node_modules/@himanshusinghs/google": { + "version": "1.2.11", + "resolved": "https://registry.npmjs.org/@himanshusinghs/google/-/google-1.2.11.tgz", + "integrity": "sha512-SKTFxwN9PpUHVrppFod8sF1jqys5azzsgcBVrSbc7VaazmVEnBxHQlv5/yfeZFjD3ly5Mw+AJdFfC0bxwdWBNg==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "@ai-sdk/provider": "1.1.2", + "@ai-sdk/provider-utils": "2.2.6" + }, + "engines": { + "node": ">=18" + }, + "peerDependencies": { + "zod": "^3.0.0" + } + }, + "node_modules/@himanshusinghs/google/node_modules/@ai-sdk/provider": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/@ai-sdk/provider/-/provider-1.1.2.tgz", + "integrity": "sha512-ITdgNilJZwLKR7X5TnUr1BsQW6UTX5yFp0h66Nfx8XjBYkWD9W3yugr50GOz3CnE9m/U/Cd5OyEbTMI0rgi6ZQ==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "json-schema": "^0.4.0" + }, + "engines": { + "node": ">=18" + } + }, + "node_modules/@himanshusinghs/google/node_modules/@ai-sdk/provider-utils": { + "version": "2.2.6", + "resolved": "https://registry.npmjs.org/@ai-sdk/provider-utils/-/provider-utils-2.2.6.tgz", + "integrity": "sha512-sUlZ7Gnq84DCGWMQRIK8XVbkzIBnvPR1diV4v6JwPgpn5armnLI/j+rqn62MpLrU5ZCQZlDKl/Lw6ed3ulYqaA==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "@ai-sdk/provider": "1.1.2", + "nanoid": "^3.3.8", + "secure-json-parse": "^2.7.0" + }, + "engines": { + "node": ">=18" + }, + "peerDependencies": { + "zod": "^3.23.8" + } + }, "node_modules/@humanfs/core": { "version": "0.19.1", "resolved": "https://registry.npmjs.org/@humanfs/core/-/core-0.19.1.tgz", @@ -1859,152 +1967,6 @@ "jsep": "^0.4.0||^1.0.0" } }, - "node_modules/@langchain/core": { - "version": "0.3.61", - "resolved": "https://registry.npmjs.org/@langchain/core/-/core-0.3.61.tgz", - "integrity": "sha512-4O7fw5SXNSE+uBnathLQrhm3t+7dZGagt/5kt37A+pXw0AkudxEBvveg73sSnpBd9SIz3/Vc7F4k8rCKXGbEDA==", - "dev": true, - "license": "MIT", - "dependencies": { - "@cfworker/json-schema": "^4.0.2", - 
"ansi-styles": "^5.0.0", - "camelcase": "6", - "decamelize": "1.2.0", - "js-tiktoken": "^1.0.12", - "langsmith": "^0.3.33", - "mustache": "^4.2.0", - "p-queue": "^6.6.2", - "p-retry": "4", - "uuid": "^10.0.0", - "zod": "^3.25.32", - "zod-to-json-schema": "^3.22.3" - }, - "engines": { - "node": ">=18" - } - }, - "node_modules/@langchain/core/node_modules/ansi-styles": { - "version": "5.2.0", - "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-5.2.0.tgz", - "integrity": "sha512-Cxwpt2SfTzTtXcfOlzGEee8O+c+MmUgGrNiBcXnuWxuFJHe6a5Hz7qwhwe5OgaSYI0IJvkLqWX1ASG+cJOkEiA==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=10" - }, - "funding": { - "url": "https://github.com/chalk/ansi-styles?sponsor=1" - } - }, - "node_modules/@langchain/core/node_modules/camelcase": { - "version": "6.3.0", - "resolved": "https://registry.npmjs.org/camelcase/-/camelcase-6.3.0.tgz", - "integrity": "sha512-Gmy6FhYlCY7uOElZUSbxo2UCDH8owEk996gkbrpsgGtrJLM3J7jGxl9Ic7Qwwj4ivOE5AWZWRMecDdF7hqGjFA==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=10" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, - "node_modules/@langchain/core/node_modules/uuid": { - "version": "10.0.0", - "resolved": "https://registry.npmjs.org/uuid/-/uuid-10.0.0.tgz", - "integrity": "sha512-8XkAphELsDnEGrDxUOHB3RGvXz6TeuYSGEZBOjtTtPm2lwhGBjLgOzLHB63IUWfBpNucQjND6d3AOudO+H3RWQ==", - "dev": true, - "funding": [ - "https://github.com/sponsors/broofa", - "https://github.com/sponsors/ctavan" - ], - "license": "MIT", - "bin": { - "uuid": "dist/bin/uuid" - } - }, - "node_modules/@langchain/google-genai": { - "version": "0.2.14", - "resolved": "https://registry.npmjs.org/@langchain/google-genai/-/google-genai-0.2.14.tgz", - "integrity": "sha512-gKe/T2LNh8wSSMJOaFmYd8cwQnDSXKtVtC6a7CFoq5nWuh0bKzhItM/7bue1aMN8mlKfB2G1HCwxhaZoSpS/DA==", - "dev": true, - "license": "MIT", - "dependencies": { - "@google/generative-ai": "^0.24.0", - "uuid": "^11.1.0" - }, - 
"engines": { - "node": ">=18" - }, - "peerDependencies": { - "@langchain/core": ">=0.3.58 <0.4.0" - } - }, - "node_modules/@langchain/ollama": { - "version": "0.2.3", - "resolved": "https://registry.npmjs.org/@langchain/ollama/-/ollama-0.2.3.tgz", - "integrity": "sha512-1Obe45jgQspqLMBVlayQbGdywFmri8DgmGRdzNu0li56cG5RReYlRCFVDZBRMMvF9JhsP5eXRyfyivtKfITHWQ==", - "dev": true, - "license": "MIT", - "dependencies": { - "ollama": "^0.5.12", - "uuid": "^10.0.0" - }, - "engines": { - "node": ">=18" - }, - "peerDependencies": { - "@langchain/core": ">=0.3.58 <0.4.0" - } - }, - "node_modules/@langchain/ollama/node_modules/uuid": { - "version": "10.0.0", - "resolved": "https://registry.npmjs.org/uuid/-/uuid-10.0.0.tgz", - "integrity": "sha512-8XkAphELsDnEGrDxUOHB3RGvXz6TeuYSGEZBOjtTtPm2lwhGBjLgOzLHB63IUWfBpNucQjND6d3AOudO+H3RWQ==", - "dev": true, - "funding": [ - "https://github.com/sponsors/broofa", - "https://github.com/sponsors/ctavan" - ], - "license": "MIT", - "bin": { - "uuid": "dist/bin/uuid" - } - }, - "node_modules/@langchain/openai": { - "version": "0.5.16", - "resolved": "https://registry.npmjs.org/@langchain/openai/-/openai-0.5.16.tgz", - "integrity": "sha512-TqzPE3PM0bMkQi53qs8vCFkwaEp3VgwGw+s1e8Nas5ICCZZtc2XqcDPz4hf2gpo1k7/AZd6HuPlAsDy6wye9Qw==", - "dev": true, - "license": "MIT", - "dependencies": { - "js-tiktoken": "^1.0.12", - "openai": "^5.3.0", - "zod": "^3.25.32" - }, - "engines": { - "node": ">=18" - }, - "peerDependencies": { - "@langchain/core": ">=0.3.58 <0.4.0" - } - }, - "node_modules/@langchain/textsplitters": { - "version": "0.1.0", - "resolved": "https://registry.npmjs.org/@langchain/textsplitters/-/textsplitters-0.1.0.tgz", - "integrity": "sha512-djI4uw9rlkAb5iMhtLED+xJebDdAG935AdP4eRTB02R7OB/act55Bj9wsskhZsvuyQRpO4O1wQOp85s6T6GWmw==", - "dev": true, - "license": "MIT", - "dependencies": { - "js-tiktoken": "^1.0.12" - }, - "engines": { - "node": ">=18" - }, - "peerDependencies": { - "@langchain/core": ">=0.2.21 <0.4.0" - } - }, 
"node_modules/@modelcontextprotocol/inspector": { "version": "0.16.1", "resolved": "https://registry.npmjs.org/@modelcontextprotocol/inspector/-/inspector-0.16.1.tgz", @@ -5381,6 +5343,13 @@ "devOptional": true, "license": "MIT" }, + "node_modules/@types/diff-match-patch": { + "version": "1.0.36", + "resolved": "https://registry.npmjs.org/@types/diff-match-patch/-/diff-match-patch-1.0.36.tgz", + "integrity": "sha512-xFdR6tkm0MWvBfO8xXCSsinYxHcqkQUlcHeSpMC2ukzOb6lwQAfDmW+Qt0AvlGd8HpsS28qKsB+oPeJn9I39jg==", + "dev": true, + "license": "MIT" + }, "node_modules/@types/estree": { "version": "1.0.8", "resolved": "https://registry.npmjs.org/@types/estree/-/estree-1.0.8.tgz", @@ -5403,13 +5372,6 @@ "undici-types": "~7.8.0" } }, - "node_modules/@types/retry": { - "version": "0.12.0", - "resolved": "https://registry.npmjs.org/@types/retry/-/retry-0.12.0.tgz", - "integrity": "sha512-wWKOClTTiizcZhXnPY4wikVAwmdYHp8q6DmC+EJUzAMsycb7HB32Kh9RN4+0gExjmPmZSAQjgURXIGATPegAvA==", - "dev": true, - "license": "MIT" - }, "node_modules/@types/simple-oauth2": { "version": "5.0.7", "resolved": "https://registry.npmjs.org/@types/simple-oauth2/-/simple-oauth2-5.0.7.tgz", @@ -5432,13 +5394,6 @@ "license": "MIT", "optional": true }, - "node_modules/@types/uuid": { - "version": "10.0.0", - "resolved": "https://registry.npmjs.org/@types/uuid/-/uuid-10.0.0.tgz", - "integrity": "sha512-7gqG38EyHgyP1S+7+xomFtL+ZNHcKv6DwNaCZmJmo1vgMugyF3TCnXVg4t1uk89mLNwnLtnY3TpOpCOyp1/xHQ==", - "dev": true, - "license": "MIT" - }, "node_modules/@types/webidl-conversions": { "version": "7.0.3", "resolved": "https://registry.npmjs.org/@types/webidl-conversions/-/webidl-conversions-7.0.3.tgz", @@ -5929,6 +5884,33 @@ "node": ">= 14" } }, + "node_modules/ai": { + "version": "4.3.16", + "resolved": "https://registry.npmjs.org/ai/-/ai-4.3.16.tgz", + "integrity": "sha512-KUDwlThJ5tr2Vw0A1ZkbDKNME3wzWhuVfAOwIvFUzl1TPVDFAXDFTXio3p+jaKneB+dKNCvFFlolYmmgHttG1g==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + 
"@ai-sdk/provider": "1.1.3", + "@ai-sdk/provider-utils": "2.2.8", + "@ai-sdk/react": "1.2.12", + "@ai-sdk/ui-utils": "1.2.11", + "@opentelemetry/api": "1.9.0", + "jsondiffpatch": "0.6.0" + }, + "engines": { + "node": ">=18" + }, + "peerDependencies": { + "react": "^18 || ^19 || ^19.0.0-rc", + "zod": "^3.23.8" + }, + "peerDependenciesMeta": { + "react": { + "optional": true + } + } + }, "node_modules/ajv": { "version": "6.12.6", "resolved": "https://registry.npmjs.org/ajv/-/ajv-6.12.6.tgz", @@ -6833,16 +6815,6 @@ "node": ">=12" } }, - "node_modules/console-table-printer": { - "version": "2.14.6", - "resolved": "https://registry.npmjs.org/console-table-printer/-/console-table-printer-2.14.6.tgz", - "integrity": "sha512-MCBl5HNVaFuuHW6FGbL/4fB7N/ormCy+tQ+sxTrF6QtSbSNETvPuOVbkJBhzDgYhvjWGrTma4eYJa37ZuoQsPw==", - "dev": true, - "license": "MIT", - "dependencies": { - "simple-wcswidth": "^1.0.1" - } - }, "node_modules/content-disposition": { "version": "1.0.0", "resolved": "https://registry.npmjs.org/content-disposition/-/content-disposition-1.0.0.tgz", @@ -7004,16 +6976,6 @@ } } }, - "node_modules/decamelize": { - "version": "1.2.0", - "resolved": "https://registry.npmjs.org/decamelize/-/decamelize-1.2.0.tgz", - "integrity": "sha512-z2S+W9X73hAUUki+N+9Za2lBlun89zigOyGrsax+KUQ6wKW4ZoWpEYBkGhQjwAjjDCkWxhY0VKEhk8wzY7F5cA==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=0.10.0" - } - }, "node_modules/decko": { "version": "1.2.0", "resolved": "https://registry.npmjs.org/decko/-/decko-1.2.0.tgz", @@ -7323,6 +7285,16 @@ "node": ">= 0.8" } }, + "node_modules/dequal": { + "version": "2.0.3", + "resolved": "https://registry.npmjs.org/dequal/-/dequal-2.0.3.tgz", + "integrity": "sha512-0je+qPKHEMohvfRTCEo3CrPG6cAzAYgmzKyxRiYSSDkS6eGJdyVJm7WaYA5ECaAD9wLB2T4EEeymA5aFVcYXCA==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=6" + } + }, "node_modules/destroy": { "version": "1.2.0", "resolved": "https://registry.npmjs.org/destroy/-/destroy-1.2.0.tgz", 
@@ -7360,6 +7332,13 @@ "node": ">=0.3.1" } }, + "node_modules/diff-match-patch": { + "version": "1.0.5", + "resolved": "https://registry.npmjs.org/diff-match-patch/-/diff-match-patch-1.0.5.tgz", + "integrity": "sha512-IayShXAgj/QMXgB0IWmKx+rOPuGMhqm5w6jvFxmVenXKIzRqTAAsbBPT3kWQeGANj3jGgvcvv4yK6SxqYmikgw==", + "dev": true, + "license": "Apache-2.0" + }, "node_modules/diff-sequences": { "version": "29.6.3", "resolved": "https://registry.npmjs.org/diff-sequences/-/diff-sequences-29.6.3.tgz", @@ -9184,16 +9163,6 @@ "node": ">=0.10.0" } }, - "node_modules/js-tiktoken": { - "version": "1.0.20", - "resolved": "https://registry.npmjs.org/js-tiktoken/-/js-tiktoken-1.0.20.tgz", - "integrity": "sha512-Xlaqhhs8VfCd6Sh7a1cFkZHQbYTLCwVJJWiHVxBYzLPxW0XsoxBy1hitmjkdIjD3Aon5BXLHFwU5O8WUx6HH+A==", - "dev": true, - "license": "MIT", - "dependencies": { - "base64-js": "^1.5.1" - } - }, "node_modules/js-tokens": { "version": "4.0.0", "resolved": "https://registry.npmjs.org/js-tokens/-/js-tokens-4.0.0.tgz", @@ -9245,6 +9214,13 @@ "foreach": "^2.0.4" } }, + "node_modules/json-schema": { + "version": "0.4.0", + "resolved": "https://registry.npmjs.org/json-schema/-/json-schema-0.4.0.tgz", + "integrity": "sha512-es94M3nTIfsEPisRafak+HDLfHXnKBhV3vU5eqPcS3flIWqcxJWgXHXiey3YrpaNsanY5ei1VoYEbOzijuq9BA==", + "dev": true, + "license": "(AFL-2.1 OR BSD-3-Clause)" + }, "node_modules/json-schema-traverse": { "version": "1.0.0", "resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-1.0.0.tgz", @@ -10072,16 +10048,6 @@ "integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==", "license": "MIT" }, - "node_modules/mustache": { - "version": "4.2.0", - "resolved": "https://registry.npmjs.org/mustache/-/mustache-4.2.0.tgz", - "integrity": "sha512-71ippSywq5Yb7/tVYyGbkBggbU8H3u5Rz56fH60jGFgr8uHwxs+aSKeqmluIVzM0m0kB7xQjKS6qPfd0b2ZoqQ==", - "dev": true, - "license": "MIT", - "bin": { - "mustache": "bin/mustache" - } - }, 
"node_modules/nan": { "version": "2.23.0", "resolved": "https://registry.npmjs.org/nan/-/nan-2.23.0.tgz", @@ -10407,14 +10373,27 @@ "node": "^10.13.0 || >=12.0.0" } }, - "node_modules/ollama": { - "version": "0.5.16", - "resolved": "https://registry.npmjs.org/ollama/-/ollama-0.5.16.tgz", - "integrity": "sha512-OEbxxOIUZtdZgOaTPAULo051F5y+Z1vosxEYOoABPnQKeW7i4O8tJNlxCB+xioyoorVqgjkdj+TA1f1Hy2ug/w==", + "node_modules/ollama-ai-provider": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/ollama-ai-provider/-/ollama-ai-provider-1.2.0.tgz", + "integrity": "sha512-jTNFruwe3O/ruJeppI/quoOUxG7NA6blG3ZyQj3lei4+NnJo7bi3eIRWqlVpRlu/mbzbFXeJSBuYQWF6pzGKww==", "dev": true, - "license": "MIT", + "license": "Apache-2.0", "dependencies": { - "whatwg-fetch": "^3.6.20" + "@ai-sdk/provider": "^1.0.0", + "@ai-sdk/provider-utils": "^2.0.0", + "partial-json": "0.1.7" + }, + "engines": { + "node": ">=18" + }, + "peerDependencies": { + "zod": "^3.0.0" + }, + "peerDependenciesMeta": { + "zod": { + "optional": true + } } }, "node_modules/on-finished": { @@ -10456,28 +10435,6 @@ "url": "https://github.com/sponsors/sindresorhus" } }, - "node_modules/openai": { - "version": "5.8.2", - "resolved": "https://registry.npmjs.org/openai/-/openai-5.8.2.tgz", - "integrity": "sha512-8C+nzoHYgyYOXhHGN6r0fcb4SznuEn1R7YZMvlqDbnCuE0FM2mm3T1HiYW6WIcMS/F1Of2up/cSPjLPaWt0X9Q==", - "dev": true, - "license": "Apache-2.0", - "bin": { - "openai": "bin/cli" - }, - "peerDependencies": { - "ws": "^8.18.0", - "zod": "^3.23.8" - }, - "peerDependenciesMeta": { - "ws": { - "optional": true - }, - "zod": { - "optional": true - } - } - }, "node_modules/openapi-fetch": { "version": "0.14.0", "resolved": "https://registry.npmjs.org/openapi-fetch/-/openapi-fetch-0.14.0.tgz", @@ -10665,16 +10622,6 @@ "dev": true, "license": "MIT" }, - "node_modules/p-finally": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/p-finally/-/p-finally-1.0.0.tgz", - "integrity": 
"sha512-LICb2p9CB7FS+0eR1oqWnHhp0FljGLZCWBE9aix0Uye9W8LTQPwMTYVGWQWIw9RdQiDg4+epXQODwIYJtSJaow==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=4" - } - }, "node_modules/p-limit": { "version": "3.1.0", "resolved": "https://registry.npmjs.org/p-limit/-/p-limit-3.1.0.tgz", @@ -10765,6 +10712,13 @@ "node": ">= 0.8" } }, + "node_modules/partial-json": { + "version": "0.1.7", + "resolved": "https://registry.npmjs.org/partial-json/-/partial-json-0.1.7.tgz", + "integrity": "sha512-Njv/59hHaokb/hRUjce3Hdv12wd60MtM9Z5Olmn+nehe0QDAsRtRbJPvJ0Z91TusF0SuZRIvnM+S4l6EIP8leA==", + "dev": true, + "license": "MIT" + }, "node_modules/path-browserify": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/path-browserify/-/path-browserify-1.0.1.tgz", @@ -11707,6 +11661,13 @@ "loose-envify": "^1.1.0" } }, + "node_modules/secure-json-parse": { + "version": "2.7.0", + "resolved": "https://registry.npmjs.org/secure-json-parse/-/secure-json-parse-2.7.0.tgz", + "integrity": "sha512-6aU+Rwsezw7VR8/nyvKTx8QpWH9FrcYiXXlqC4z5d5XQBDRqtbfsRjnwGyqbi3gddNtWHuEk9OANUotL26qKUw==", + "dev": true, + "license": "BSD-3-Clause" + }, "node_modules/seek-bzip": { "version": "1.0.6", "resolved": "https://registry.npmjs.org/seek-bzip/-/seek-bzip-1.0.6.tgz", @@ -12153,13 +12114,6 @@ "joi": "^17.6.4" } }, - "node_modules/simple-wcswidth": { - "version": "1.1.2", - "resolved": "https://registry.npmjs.org/simple-wcswidth/-/simple-wcswidth-1.1.2.tgz", - "integrity": "sha512-j7piyCjAeTDSjzTSQ7DokZtMNwNlEAyxqSZeCS+CXH7fJ4jx3FuJ/mTW3mE+6JLs4VJBbcll0Kjn+KXI5t21Iw==", - "dev": true, - "license": "MIT" - }, "node_modules/simple-websocket": { "version": "9.1.0", "resolved": "https://registry.npmjs.org/simple-websocket/-/simple-websocket-9.1.0.tgz", @@ -12624,6 +12578,20 @@ "node": ">= 6" } }, + "node_modules/swr": { + "version": "2.3.3", + "resolved": "https://registry.npmjs.org/swr/-/swr-2.3.3.tgz", + "integrity": 
"sha512-dshNvs3ExOqtZ6kJBaAsabhPdHyeY4P2cKwRCniDVifBMoG/SVI7tfLWqPXriVspf2Rg4tPzXJTnwaihIeFw2A==", + "dev": true, + "license": "MIT", + "dependencies": { + "dequal": "^2.0.3", + "use-sync-external-store": "^1.4.0" + }, + "peerDependencies": { + "react": "^16.11.0 || ^17.0.0 || ^18.0.0 || ^19.0.0" + } + }, "node_modules/synckit": { "version": "0.11.8", "resolved": "https://registry.npmjs.org/synckit/-/synckit-0.11.8.tgz", @@ -12867,6 +12835,19 @@ "node": ">=16 || 14 >=14.17" } }, + "node_modules/throttleit": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/throttleit/-/throttleit-2.1.0.tgz", + "integrity": "sha512-nt6AMGKW1p/70DF/hGBdJB57B8Tspmbp5gfJ8ilhLnt7kkr2ye7hzD6NVG8GGErk2HWF34igrL2CXmNIkzKqKw==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=18" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, "node_modules/through": { "version": "2.3.8", "resolved": "https://registry.npmjs.org/through/-/through-2.3.8.tgz", @@ -13660,13 +13641,6 @@ "node": ">=12" } }, - "node_modules/whatwg-fetch": { - "version": "3.6.20", - "resolved": "https://registry.npmjs.org/whatwg-fetch/-/whatwg-fetch-3.6.20.tgz", - "integrity": "sha512-EqhiFU6daOA8kpjOWTL0olhVOF3i7OrFzSYiGsEMB8GcXS+RrzauAERX65xMeNWVqxA6HXH2m69Z9LaKKdisfg==", - "dev": true, - "license": "MIT" - }, "node_modules/whatwg-url": { "version": "14.2.0", "resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-14.2.0.tgz", diff --git a/tests/accuracy/sdk/agent.ts b/tests/accuracy/sdk/agent.ts new file mode 100644 index 00000000..905cfff9 --- /dev/null +++ b/tests/accuracy/sdk/agent.ts @@ -0,0 +1,38 @@ +import { generateText, Tool, Schema, LanguageModelV1 } from "ai"; +import { Model } from "./models.js"; + +const systemPrompt = [ + 'The keywords "MUST", "MUST NOT", "SHALL", "SHALL NOT", "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this document are to be interpreted as described in RFC 2119', + "You are an expert AI assistant with 
access to a set of tools for MongoDB database operations.", + "You MUST use the most relevant tool to answer the user's request", + "When calling a tool, you MUST strictly follow its input schema and MUST provide all required arguments", + "If a task requires multiple steps, you MUST call the necessary tools in sequence", + 'If you do not know the answer or the request cannot be fulfilled, you MUST reply with "I don\'t know"', + "You SHOULD assume that you are already connected to a MongoDB connection", +].join("\n"); + +export interface Agent { + prompt(prompt: string, model: M, tools: T): Promise; +} + +export function getVercelToolCallingAgent(): Agent< + Model, + Record>>, + { text: string; messages: unknown[] } +> { + return { + async prompt(prompt: string, model: Model, tools: Record>>) { + const result = await generateText({ + model: model.getModel(), + system: systemPrompt, + prompt, + tools, + maxSteps: 100, + }); + return { + text: result.text, + messages: result.response.messages, + }; + }, + }; +} diff --git a/tests/accuracy/sdk/describe-accuracy-tests.ts b/tests/accuracy/sdk/describe-accuracy-tests.ts index 0ec4bb64..97496f6e 100644 --- a/tests/accuracy/sdk/describe-accuracy-tests.ts +++ b/tests/accuracy/sdk/describe-accuracy-tests.ts @@ -1,14 +1,13 @@ -import { AgentExecutor } from "langchain/agents"; import { Tool } from "@modelcontextprotocol/sdk/types.js"; -import { discoverMongoDBTools, TestTools, ToolResultGenerators } from "./test-tools.js"; +import { discoverMongoDBTools, TestTools, MockedTools } from "./test-tools.js"; import { TestableModels } from "./models.js"; -import { getToolCallingAgent } from "./tool-calling-agent.js"; import { ExpectedToolCall, parameterMatchingAccuracyScorer, toolCallingAccuracyScorer } from "./accuracy-scorers.js"; +import { Agent, getVercelToolCallingAgent } from "./agent.js"; interface AccuracyTestConfig { prompt: string; expectedToolCalls: ExpectedToolCall[]; - mockedTools: ToolResultGenerators; + mockedTools: 
MockedTools; } export function describeAccuracyTests( @@ -22,7 +21,7 @@ export function describeAccuracyTests( eachModel(`$modelName - ${suiteName}`, function (model) { let mcpTools: Tool[]; let testTools: TestTools; - let agent: AgentExecutor; + let agent: Agent; beforeAll(async () => { mcpTools = await discoverMongoDBTools(); @@ -30,13 +29,12 @@ export function describeAccuracyTests( beforeEach(() => { testTools = new TestTools(mcpTools); - const transformToolResult = model.transformToolResult.bind(model); - agent = getToolCallingAgent(model, testTools.langChainTools(transformToolResult)); + agent = getVercelToolCallingAgent(); }); eachTest("$prompt", async function (testConfig) { testTools.mockTools(testConfig.mockedTools); - const conversation = await agent.invoke({ input: testConfig.prompt }); + const conversation = await agent.prompt(testConfig.prompt, model, testTools.vercelAiTools()); console.log("conversation", conversation); const toolCalls = testTools.getToolCalls(); console.log("?????? toolCalls", toolCalls); diff --git a/tests/accuracy/sdk/models.ts b/tests/accuracy/sdk/models.ts index d370633f..832aad30 100644 --- a/tests/accuracy/sdk/models.ts +++ b/tests/accuracy/sdk/models.ts @@ -1,58 +1,42 @@ -import { BaseChatModel } from "@langchain/core/language_models/chat_models"; -import { ChatGoogleGenerativeAI } from "@langchain/google-genai"; -import { ChatOllama } from "@langchain/ollama"; -import { CallToolResult } from "@modelcontextprotocol/sdk/types.js"; +import { LanguageModelV1 } from "ai"; +import { createGoogleGenerativeAI } from "@himanshusinghs/google"; +import { ollama } from "ollama-ai-provider"; -type ToolResultForOllama = string; -export type AcceptableToolResponse = CallToolResult | ToolResultForOllama; - -export interface Model { +export interface Model

{ isAvailable(): boolean; - getLangChainModel(): M; - transformToolResult(callToolResult: CallToolResult): T; + getModel(): P; } -export class GeminiModel implements Model { +export class GeminiModel implements Model { constructor(readonly modelName: string) {} isAvailable(): boolean { return !!process.env.MDB_GEMINI_API_KEY; } - getLangChainModel(): ChatGoogleGenerativeAI { - return new ChatGoogleGenerativeAI({ - model: this.modelName, + getModel() { + return createGoogleGenerativeAI({ apiKey: process.env.MDB_GEMINI_API_KEY, - }); - } - - transformToolResult(callToolResult: CallToolResult) { - return callToolResult; + })(this.modelName); } } -export class OllamaModel implements Model { +export class OllamaModel implements Model { constructor(readonly modelName: string) {} isAvailable(): boolean { - return !!process.env.MDB_GEMINI_API_KEY; - } - - getLangChainModel(): ChatOllama { - return new ChatOllama({ - model: this.modelName, - }); + return true; } - transformToolResult(callToolResult: CallToolResult): ToolResultForOllama { - return JSON.stringify(callToolResult); + getModel() { + return ollama(this.modelName); } } const ALL_TESTABLE_MODELS = [ - // new GeminiModel("gemini-1.5-flash"), + new GeminiModel("gemini-1.5-flash"), // new GeminiModel("gemini-2.0-flash"), - new OllamaModel("qwen3:latest"), + // new OllamaModel("qwen3:latest"), ]; export type TestableModels = ReturnType; diff --git a/tests/accuracy/sdk/test-tools.ts b/tests/accuracy/sdk/test-tools.ts index 82719454..cb728a36 100644 --- a/tests/accuracy/sdk/test-tools.ts +++ b/tests/accuracy/sdk/test-tools.ts @@ -1,6 +1,6 @@ -import { jest } from "@jest/globals"; +import { JSONSchema7 } from "json-schema"; import { v4 as uuid } from "uuid"; -import { DynamicTool, tool as langChainTool } from "@langchain/core/tools"; +import { Tool as VercelTool, Schema, tool as createVercelTool, jsonSchema } from "ai"; import { Client } from "@modelcontextprotocol/sdk/client/index.js"; import { McpServer } from 
"@modelcontextprotocol/sdk/server/mcp.js"; import { CallToolResult, Tool } from "@modelcontextprotocol/sdk/types.js"; @@ -10,15 +10,22 @@ import { defaultTestConfig } from "../../integration/helpers.js"; import { Session } from "../../../src/session.js"; import { Telemetry } from "../../../src/telemetry/telemetry.js"; import { Server } from "../../../src/server.js"; -import { AcceptableToolResponse } from "./models.js"; import { ToolCall } from "./accuracy-scorers.js"; type ToolResultGeneratorFn = (...parameters: unknown[]) => CallToolResult; -type MockedToolResultGeneratorFn = jest.MockedFunction; -type MockedTools = Record; -export type ToolResultGenerators = Record; -export type LangChainTool = DynamicTool; -export type ToolResultTransformer = (toolResult: CallToolResult) => T; +export type MockedTools = Record; + +function getDefaultToolResultGeneratorFn(): ToolResultGeneratorFn { + return () => ({ + content: [ + { + type: "text", + text: `Mock implementation for tool not present`, + }, + ], + isError: true, + }); +} export class TestTools { private mockedTools: MockedTools = {}; @@ -26,15 +33,7 @@ export class TestTools { constructor(private readonly mcpTools: Tool[]) { for (const mcpTool of mcpTools) { - this.mockedTools[mcpTool.name] = jest.fn().mockReturnValue({ - content: [ - { - type: "text", - text: `Mock implementation for tool - ${mcpTool.name} not present`, - }, - ], - isError: true, - }); + this.mockedTools[mcpTool.name] = getDefaultToolResultGeneratorFn(); } } @@ -42,9 +41,9 @@ export class TestTools { return this.recordedToolCalls; } - mockTools(toolResultGenerators: ToolResultGenerators) { - for (const toolName in toolResultGenerators) { - const toolResultGeneratorFn = toolResultGenerators[toolName]; + mockTools(mockedTools: MockedTools) { + for (const toolName in mockedTools) { + const toolResultGeneratorFn = mockedTools[toolName]; if (!this.mockedTools[toolName]) { throw new Error(`Attempted to mock unrecognized tool - ${toolName}`); } @@ -53,49 
+52,40 @@ export class TestTools { // Are you happy TS? continue; } - this.mockedTools[toolName] = jest.fn(toolResultGeneratorFn); + this.mockedTools[toolName] = toolResultGeneratorFn; } } - langChainTools( - transformToolResult: ToolResultTransformer - ): LangChainTool[] { - return this.mcpTools.map((mcpTool) => { - return langChainTool((...args) => { - console.log("????? args", args); - const [parameters, { runName, runId }] = args; - const toolCallId = typeof runId !== "undefined" ? `${runId}` : uuid(); - return this.langChainToolResultGenerator(`${runName}`, parameters, toolCallId, transformToolResult); - }, mcpTool); - }); - } - - private langChainToolResultGenerator( - tool: string, - parameters: unknown, - toolCallId: string, - transformToolResult: ToolResultTransformer - ): T { - this.recordedToolCalls.push({ - toolCallId: toolCallId, - toolName: tool, - parameters, - }); - const mockedToolResultGenerator = this.mockedTools[tool]; - if (!mockedToolResultGenerator) { - // log as well - return transformToolResult({ - content: [ - { - type: "text", - text: `Could not resolve tool generator for ${tool}`, - }, - ], - isError: true, + vercelAiTools(): Record>> { + const vercelTools: Record>> = {}; + for (const tool of this.mcpTools) { + vercelTools[tool.name] = createVercelTool({ + description: tool.description, + parameters: jsonSchema(tool.inputSchema as JSONSchema7), + // eslint-disable-next-line @typescript-eslint/require-await + execute: async (args: unknown) => { + this.recordedToolCalls.push({ + toolCallId: uuid(), + toolName: tool.name, + parameters: args, + }); + const toolResultGeneratorFn = this.mockedTools[tool.name]; + if (!toolResultGeneratorFn) { + return { + content: [ + { + type: "text", + text: `Could not resolve tool generator for ${tool.name}`, + }, + ], + }; + } + + return toolResultGeneratorFn(args); + }, }); } - - return transformToolResult(mockedToolResultGenerator(parameters)); + return vercelTools; } } diff --git 
a/tests/accuracy/sdk/tool-calling-agent.ts b/tests/accuracy/sdk/tool-calling-agent.ts deleted file mode 100644 index b9adedf5..00000000 --- a/tests/accuracy/sdk/tool-calling-agent.ts +++ /dev/null @@ -1,36 +0,0 @@ -import { ChatPromptTemplate } from "@langchain/core/prompts"; -import { createToolCallingAgent, AgentExecutor } from "langchain/agents"; - -import { LangChainTool } from "./test-tools.js"; -import { AcceptableToolResponse, Model } from "./models.js"; -import { BaseChatModel } from "@langchain/core/language_models/chat_models"; - -const prompt = ChatPromptTemplate.fromMessages([ - [ - "system", - [ - 'The keywords "MUST", "MUST NOT", "SHALL", "SHALL NOT", "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this document are to be interpreted as described in RFC 2119 (https://www.ietf.org/rfc/rfc2119.txt)', - "You are an expect AI assistant with access to a set of tools for MongoDB database operations.", - "You MUST use the most relevant tool to answer the user's request", - "When calling a tool, you MUST strictly follow its input schema and MUST provide all required arguments", - "If a task requires multiple steps, you MUST call the necessary tools in sequence", - 'If you do not know the answer or the request cannot be fulfilled, you MUST reply with "I don\'t know"', - ].join("\n"), - ], - ["human", "{input}"], - ["placeholder", "{agent_scratchpad}"], -]); - -export function getToolCallingAgent( - model: Model, - tools: LangChainTool[] -) { - const llm = model.getLangChainModel(); - const agent = createToolCallingAgent({ - llm, - tools, - prompt, - }); - const agentExecutor = new AgentExecutor({ agent, tools }); - return agentExecutor; -} From 6f7b99a8e26933e3dd5d42f68efb0733d18bbf9a Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Mon, 30 Jun 2025 13:01:57 +0200 Subject: [PATCH 03/91] chore: integrate capturing accuracy snapshots --- package.json | 3 +- tests/accuracy/sdk/accuracy-snapshot.ts | 54 +++++++++++++++++++ 
tests/accuracy/sdk/describe-accuracy-tests.ts | 40 ++++++++++++-- tests/accuracy/sdk/models.ts | 7 +-- tests/accuracy/sdk/test-tools.ts | 3 -- 5 files changed, 95 insertions(+), 12 deletions(-) create mode 100644 tests/accuracy/sdk/accuracy-snapshot.ts diff --git a/package.json b/package.json index 5973a804..ce6b5c03 100644 --- a/package.json +++ b/package.json @@ -29,7 +29,8 @@ "check:types": "tsc --noEmit --project tsconfig.json", "reformat": "prettier --write .", "generate": "./scripts/generate.sh", - "test": "vitest --coverage" + "test": "vitest --coverage", + "test:accuracy": "node --experimental-vm-modules node_modules/jest/bin/jest.js --testPathPattern tests/accuracy" }, "license": "Apache-2.0", "devDependencies": { diff --git a/tests/accuracy/sdk/accuracy-snapshot.ts b/tests/accuracy/sdk/accuracy-snapshot.ts new file mode 100644 index 00000000..1f7867a9 --- /dev/null +++ b/tests/accuracy/sdk/accuracy-snapshot.ts @@ -0,0 +1,54 @@ +import fs from "fs/promises"; +import path from "path"; +import { z } from "zod"; + +export const SNAPSHOT_FILE_PATH = path.resolve(process.cwd(), "accuracy-snapshot.json"); + +export const AccuracySnapshotEntrySchema = z.object({ + datetime: z.string(), + commit: z.string(), + model: z.string(), + suite: z.string(), + test: z.string(), + toolCallingAccuracy: z.number(), + parameterAccuracy: z.number(), +}); + +export type AccuracySnapshotEntry = z.infer; + +export async function readSnapshot(): Promise { + try { + const raw = await fs.readFile(SNAPSHOT_FILE_PATH, "utf8"); + return AccuracySnapshotEntrySchema.array().parse(JSON.parse(raw)); + } catch (e: unknown) { + if ((e as { code: string }).code === "ENOENT") { + return []; + } + throw e; + } +} + +function waitFor(ms: number) { + return new Promise((resolve) => setTimeout(resolve, ms)); +} + +export async function appendAccuracySnapshot(entry: AccuracySnapshotEntry): Promise { + AccuracySnapshotEntrySchema.parse(entry); + + for (let attempt = 0; attempt < 5; attempt++) { + try 
{ + const snapshot = await readSnapshot(); + snapshot.unshift(entry); + const tmp = `${SNAPSHOT_FILE_PATH}~${Date.now()}`; + await fs.writeFile(tmp, JSON.stringify(snapshot, null, 2)); + await fs.rename(tmp, SNAPSHOT_FILE_PATH); + return; + } catch (e) { + if (attempt < 4) { + await waitFor(100 + Math.random() * 200); + } else { + throw e; + } + } + } +} diff --git a/tests/accuracy/sdk/describe-accuracy-tests.ts b/tests/accuracy/sdk/describe-accuracy-tests.ts index 97496f6e..a3ad0668 100644 --- a/tests/accuracy/sdk/describe-accuracy-tests.ts +++ b/tests/accuracy/sdk/describe-accuracy-tests.ts @@ -3,6 +3,7 @@ import { discoverMongoDBTools, TestTools, MockedTools } from "./test-tools.js"; import { TestableModels } from "./models.js"; import { ExpectedToolCall, parameterMatchingAccuracyScorer, toolCallingAccuracyScorer } from "./accuracy-scorers.js"; import { Agent, getVercelToolCallingAgent } from "./agent.js"; +import { appendAccuracySnapshot } from "./accuracy-snapshot.js"; interface AccuracyTestConfig { prompt: string; @@ -15,6 +16,20 @@ export function describeAccuracyTests( models: TestableModels, accuracyTestConfigs: AccuracyTestConfig[] ) { + const accuracyDatetime = process.env.ACCURACY_DATETIME; + if (!accuracyDatetime) { + throw new Error("ACCURACY_DATETIME environment variable is not set"); + } + const accuracyCommit = process.env.ACCURACY_COMMIT; + if (!accuracyCommit) { + throw new Error("ACCURACY_COMMIT environment variable is not set"); + } + + if (!models.length) { + console.warn(`No models available to test ${suiteName}`); + return; + } + const eachModel = describe.each(models); const eachTest = it.each(accuracyTestConfigs); @@ -35,15 +50,30 @@ export function describeAccuracyTests( eachTest("$prompt", async function (testConfig) { testTools.mockTools(testConfig.mockedTools); const conversation = await agent.prompt(testConfig.prompt, model, testTools.vercelAiTools()); - console.log("conversation", conversation); const toolCalls = 
testTools.getToolCalls(); - console.log("?????? toolCalls", toolCalls); - console.log("???? expected", testConfig.expectedToolCalls); const toolCallingAccuracy = toolCallingAccuracyScorer(testConfig.expectedToolCalls, toolCalls); const parameterMatchingAccuracy = parameterMatchingAccuracyScorer(testConfig.expectedToolCalls, toolCalls); + await appendAccuracySnapshot({ + datetime: accuracyDatetime, + commit: accuracyCommit, + model: model.modelName, + suite: suiteName, + test: testConfig.prompt, + toolCallingAccuracy, + parameterAccuracy: parameterMatchingAccuracy, + }); - expect(toolCallingAccuracy).not.toEqual(0); - expect(parameterMatchingAccuracy).toBeGreaterThanOrEqual(0.5); + try { + expect(toolCallingAccuracy).not.toEqual(0); + expect(parameterMatchingAccuracy).toBeGreaterThanOrEqual(0.5); + } catch (error) { + console.warn(`Accuracy test failed for ${model.modelName} - ${suiteName} - ${testConfig.prompt}`); + console.warn(`Conversation`, JSON.stringify(conversation, null, 2)); + console.warn(`Tool calls`, JSON.stringify(toolCalls, null, 2)); + console.warn(`Tool calling accuracy`, toolCallingAccuracy); + console.warn(`Parameter matching accuracy`, parameterMatchingAccuracy); + throw error; + } }); }); } diff --git a/tests/accuracy/sdk/models.ts b/tests/accuracy/sdk/models.ts index 832aad30..27b8e972 100644 --- a/tests/accuracy/sdk/models.ts +++ b/tests/accuracy/sdk/models.ts @@ -3,6 +3,7 @@ import { createGoogleGenerativeAI } from "@himanshusinghs/google"; import { ollama } from "ollama-ai-provider"; export interface Model

{ + readonly modelName: string; isAvailable(): boolean; getModel(): P; } @@ -25,7 +26,7 @@ export class OllamaModel implements Model { constructor(readonly modelName: string) {} isAvailable(): boolean { - return true; + return false; } getModel() { @@ -35,8 +36,8 @@ export class OllamaModel implements Model { const ALL_TESTABLE_MODELS = [ new GeminiModel("gemini-1.5-flash"), - // new GeminiModel("gemini-2.0-flash"), - // new OllamaModel("qwen3:latest"), + new GeminiModel("gemini-2.0-flash"), + new OllamaModel("qwen3:latest"), ]; export type TestableModels = ReturnType; diff --git a/tests/accuracy/sdk/test-tools.ts b/tests/accuracy/sdk/test-tools.ts index cb728a36..595a9069 100644 --- a/tests/accuracy/sdk/test-tools.ts +++ b/tests/accuracy/sdk/test-tools.ts @@ -132,9 +132,6 @@ export async function discoverMongoDBTools(): Promise { await mcpClient.connect(clientTransport); return (await mcpClient.listTools()).tools; - } catch (error: unknown) { - console.error("Unexpected error occured", error); - return []; } finally { await mcpClient?.close(); await mcpServer?.session?.close(); From add4204c60e627f3572f9695a5b34ce34252f9f7 Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Mon, 30 Jun 2025 13:03:50 +0200 Subject: [PATCH 04/91] chore: correct env names --- tests/accuracy/sdk/describe-accuracy-tests.ts | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/accuracy/sdk/describe-accuracy-tests.ts b/tests/accuracy/sdk/describe-accuracy-tests.ts index a3ad0668..5d500ffa 100644 --- a/tests/accuracy/sdk/describe-accuracy-tests.ts +++ b/tests/accuracy/sdk/describe-accuracy-tests.ts @@ -16,13 +16,13 @@ export function describeAccuracyTests( models: TestableModels, accuracyTestConfigs: AccuracyTestConfig[] ) { - const accuracyDatetime = process.env.ACCURACY_DATETIME; + const accuracyDatetime = process.env.MDB_ACCURACY_DATETIME; if (!accuracyDatetime) { - throw new Error("ACCURACY_DATETIME environment variable is not set"); + throw new 
Error("MDB_ACCURACY_DATETIME environment variable is not set"); } - const accuracyCommit = process.env.ACCURACY_COMMIT; + const accuracyCommit = process.env.MDB_ACCURACY_COMMIT; if (!accuracyCommit) { - throw new Error("ACCURACY_COMMIT environment variable is not set"); + throw new Error("MDB_ACCURACY_COMMIT environment variable is not set"); } if (!models.length) { From f0c1d388a8034301ced21ea58b4c296e8c128dbf Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Mon, 30 Jun 2025 13:29:32 +0200 Subject: [PATCH 05/91] chore: more consolidated prompt tests --- tests/accuracy/list-databases.test.ts | 28 +++++++++---- tests/accuracy/sdk/agent.ts | 13 +++--- tests/accuracy/sdk/describe-accuracy-tests.ts | 41 ++++++++++--------- tests/accuracy/sdk/models.ts | 4 +- 4 files changed, 48 insertions(+), 38 deletions(-) diff --git a/tests/accuracy/list-databases.test.ts b/tests/accuracy/list-databases.test.ts index ae3f6c7d..d26fbc4e 100644 --- a/tests/accuracy/list-databases.test.ts +++ b/tests/accuracy/list-databases.test.ts @@ -1,16 +1,22 @@ import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; import { getAvailableModels } from "./sdk/models.js"; +import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; -describeAccuracyTests("list-databases", getAvailableModels(), [ - { - prompt: "Assume that you're already connected. 
How many collections are there in sample_mflix database", +function describeListDatabasesAccuracyTests(prompt: string): AccuracyTestConfig { + return { + systemPrompt: "Assume that you're already connected.", + prompt: prompt, mockedTools: { - "list-collections": function listCollections() { + "list-databases": function listDatabases() { return { content: [ { type: "text", - text: "Name: coll1", + text: "Name: db1", + }, + { + type: "text", + text: "Name: db2", }, ], }; @@ -18,9 +24,15 @@ describeAccuracyTests("list-databases", getAvailableModels(), [ }, expectedToolCalls: [ { - toolName: "list-collections", - parameters: { database: "sample_mflix" }, + toolName: "list-databases", + parameters: {}, }, ], - }, + }; +} + +describeAccuracyTests("list-databases", getAvailableModels(), [ + describeListDatabasesAccuracyTests("How many databases do I have?"), + describeListDatabasesAccuracyTests("List all the databases in my cluster."), + describeListDatabasesAccuracyTests("Is there a sample_mflix database in my cluster?"), ]); diff --git a/tests/accuracy/sdk/agent.ts b/tests/accuracy/sdk/agent.ts index 905cfff9..eb680358 100644 --- a/tests/accuracy/sdk/agent.ts +++ b/tests/accuracy/sdk/agent.ts @@ -8,23 +8,20 @@ const systemPrompt = [ "When calling a tool, you MUST strictly follow its input schema and MUST provide all required arguments", "If a task requires multiple steps, you MUST call the necessary tools in sequence", 'If you do not know the answer or the request cannot be fulfilled, you MUST reply with "I don\'t know"', - "You SHOULD assume that you are already connected to a MongoDB connection", -].join("\n"); +]; export interface Agent { prompt(prompt: string, model: M, tools: T): Promise; } -export function getVercelToolCallingAgent(): Agent< - Model, - Record>>, - { text: string; messages: unknown[] } -> { +export function getVercelToolCallingAgent( + requestedSystemPrompt?: string +): Agent, Record>>, { text: string; messages: unknown[] }> { return { async 
prompt(prompt: string, model: Model, tools: Record>>) { const result = await generateText({ model: model.getModel(), - system: systemPrompt, + system: [...systemPrompt, requestedSystemPrompt].join("\n"), prompt, tools, maxSteps: 100, diff --git a/tests/accuracy/sdk/describe-accuracy-tests.ts b/tests/accuracy/sdk/describe-accuracy-tests.ts index 5d500ffa..bf99d509 100644 --- a/tests/accuracy/sdk/describe-accuracy-tests.ts +++ b/tests/accuracy/sdk/describe-accuracy-tests.ts @@ -5,7 +5,8 @@ import { ExpectedToolCall, parameterMatchingAccuracyScorer, toolCallingAccuracyS import { Agent, getVercelToolCallingAgent } from "./agent.js"; import { appendAccuracySnapshot } from "./accuracy-snapshot.js"; -interface AccuracyTestConfig { +export interface AccuracyTestConfig { + systemPrompt?: string; prompt: string; expectedToolCalls: ExpectedToolCall[]; mockedTools: MockedTools; @@ -17,13 +18,7 @@ export function describeAccuracyTests( accuracyTestConfigs: AccuracyTestConfig[] ) { const accuracyDatetime = process.env.MDB_ACCURACY_DATETIME; - if (!accuracyDatetime) { - throw new Error("MDB_ACCURACY_DATETIME environment variable is not set"); - } const accuracyCommit = process.env.MDB_ACCURACY_COMMIT; - if (!accuracyCommit) { - throw new Error("MDB_ACCURACY_COMMIT environment variable is not set"); - } if (!models.length) { console.warn(`No models available to test ${suiteName}`); @@ -53,25 +48,31 @@ export function describeAccuracyTests( const toolCalls = testTools.getToolCalls(); const toolCallingAccuracy = toolCallingAccuracyScorer(testConfig.expectedToolCalls, toolCalls); const parameterMatchingAccuracy = parameterMatchingAccuracyScorer(testConfig.expectedToolCalls, toolCalls); - await appendAccuracySnapshot({ - datetime: accuracyDatetime, - commit: accuracyCommit, - model: model.modelName, - suite: suiteName, - test: testConfig.prompt, - toolCallingAccuracy, - parameterAccuracy: parameterMatchingAccuracy, - }); + if (accuracyDatetime && accuracyCommit) { + await 
appendAccuracySnapshot({ + datetime: accuracyDatetime, + commit: accuracyCommit, + model: model.modelName, + suite: suiteName, + test: testConfig.prompt, + toolCallingAccuracy, + parameterAccuracy: parameterMatchingAccuracy, + }); + } else { + console.info( + `Skipping accuracy snapshot update for ${model.modelName} - ${suiteName} - ${testConfig.prompt}` + ); + } try { expect(toolCallingAccuracy).not.toEqual(0); expect(parameterMatchingAccuracy).toBeGreaterThanOrEqual(0.5); } catch (error) { console.warn(`Accuracy test failed for ${model.modelName} - ${suiteName} - ${testConfig.prompt}`); - console.warn(`Conversation`, JSON.stringify(conversation, null, 2)); - console.warn(`Tool calls`, JSON.stringify(toolCalls, null, 2)); - console.warn(`Tool calling accuracy`, toolCallingAccuracy); - console.warn(`Parameter matching accuracy`, parameterMatchingAccuracy); + console.debug(`Conversation`, JSON.stringify(conversation, null, 2)); + console.debug(`Tool calls`, JSON.stringify(toolCalls, null, 2)); + console.debug(`Tool calling accuracy`, toolCallingAccuracy); + console.debug(`Parameter matching accuracy`, parameterMatchingAccuracy); throw error; } }); diff --git a/tests/accuracy/sdk/models.ts b/tests/accuracy/sdk/models.ts index 27b8e972..f6f8a879 100644 --- a/tests/accuracy/sdk/models.ts +++ b/tests/accuracy/sdk/models.ts @@ -36,8 +36,8 @@ export class OllamaModel implements Model { const ALL_TESTABLE_MODELS = [ new GeminiModel("gemini-1.5-flash"), - new GeminiModel("gemini-2.0-flash"), - new OllamaModel("qwen3:latest"), + // new GeminiModel("gemini-2.0-flash"), + // new OllamaModel("qwen3:latest"), ]; export type TestableModels = ReturnType; From 8fe49428fae852a4082bf02c7200e8f9e42cce9d Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Mon, 30 Jun 2025 16:37:10 +0200 Subject: [PATCH 06/91] chore: add a few more tests and some more models --- tests/accuracy/list-collections.test.ts | 38 +++++++++++++++++++++++++ tests/accuracy/sdk/models.ts | 6 ++-- 2 files changed, 
41 insertions(+), 3 deletions(-) create mode 100644 tests/accuracy/list-collections.test.ts diff --git a/tests/accuracy/list-collections.test.ts b/tests/accuracy/list-collections.test.ts new file mode 100644 index 00000000..b871a96e --- /dev/null +++ b/tests/accuracy/list-collections.test.ts @@ -0,0 +1,38 @@ +import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; +import { getAvailableModels } from "./sdk/models.js"; +import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; + +function describeListCollectionsAccuracyTests(prompt: string): AccuracyTestConfig { + return { + systemPrompt: "Assume that you're already connected.", + prompt: prompt, + mockedTools: { + "list-collections": function listCollections() { + return { + content: [ + { + type: "text", + text: "Name: coll1", + }, + { + type: "text", + text: "Name: coll1", + }, + ], + }; + }, + }, + expectedToolCalls: [ + { + toolName: "list-collections", + parameters: { database: "db1" }, + }, + ], + }; +} + +describeAccuracyTests("list-collections", getAvailableModels(), [ + describeListCollectionsAccuracyTests("How many collections do I have in database db1?"), + describeListCollectionsAccuracyTests("List all the collections in my MongoDB database db1."), + describeListCollectionsAccuracyTests("Is there a coll1 collection in my MongoDB database db1?"), +]); diff --git a/tests/accuracy/sdk/models.ts b/tests/accuracy/sdk/models.ts index f6f8a879..e3f5ab1f 100644 --- a/tests/accuracy/sdk/models.ts +++ b/tests/accuracy/sdk/models.ts @@ -26,7 +26,7 @@ export class OllamaModel implements Model { constructor(readonly modelName: string) {} isAvailable(): boolean { - return false; + return true; } getModel() { @@ -36,8 +36,8 @@ export class OllamaModel implements Model { const ALL_TESTABLE_MODELS = [ new GeminiModel("gemini-1.5-flash"), - // new GeminiModel("gemini-2.0-flash"), - // new OllamaModel("qwen3:latest"), + new GeminiModel("gemini-2.0-flash"), + new OllamaModel("qwen3:1.7b"), ]; 
export type TestableModels = ReturnType; From d220f227b005a1789272832b3ffd761fa89aad18 Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Tue, 1 Jul 2025 13:01:41 +0200 Subject: [PATCH 07/91] chore: add AzureOpenAI model in the model list --- package-lock.json | 35 +++++++++++++++++++ tests/accuracy/list-collections.test.ts | 2 +- tests/accuracy/list-databases.test.ts | 2 +- tests/accuracy/sdk/describe-accuracy-tests.ts | 6 +++- tests/accuracy/sdk/models.ts | 21 +++++++++-- 5 files changed, 61 insertions(+), 5 deletions(-) diff --git a/package-lock.json b/package-lock.json index 497cf564..f42c14c5 100644 --- a/package-lock.json +++ b/package-lock.json @@ -59,6 +59,41 @@ "@himanshusinghs/ai-sdk-google": { "extraneous": true }, + "node_modules/@ai-sdk/azure": { + "version": "1.3.23", + "resolved": "https://registry.npmjs.org/@ai-sdk/azure/-/azure-1.3.23.tgz", + "integrity": "sha512-vpsaPtU24RBVk/IMM5UylR/N4RtAuL2NZLWc7LJ3tvMTHu6pI46a7w+1qIwR3F6yO9ehWR8qvfLaBefJNFxaVw==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "@ai-sdk/openai": "1.3.22", + "@ai-sdk/provider": "1.1.3", + "@ai-sdk/provider-utils": "2.2.8" + }, + "engines": { + "node": ">=18" + }, + "peerDependencies": { + "zod": "^3.0.0" + } + }, + "node_modules/@ai-sdk/openai": { + "version": "1.3.22", + "resolved": "https://registry.npmjs.org/@ai-sdk/openai/-/openai-1.3.22.tgz", + "integrity": "sha512-QwA+2EkG0QyjVR+7h6FE7iOu2ivNqAVMm9UJZkVxxTk5OIq5fFJDTEI/zICEMuHImTTXR2JjsL6EirJ28Jc4cw==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "@ai-sdk/provider": "1.1.3", + "@ai-sdk/provider-utils": "2.2.8" + }, + "engines": { + "node": ">=18" + }, + "peerDependencies": { + "zod": "^3.0.0" + } + }, "node_modules/@ai-sdk/provider": { "version": "1.1.3", "resolved": "https://registry.npmjs.org/@ai-sdk/provider/-/provider-1.1.3.tgz", diff --git a/tests/accuracy/list-collections.test.ts b/tests/accuracy/list-collections.test.ts index b871a96e..2bc11dea 100644 --- 
a/tests/accuracy/list-collections.test.ts +++ b/tests/accuracy/list-collections.test.ts @@ -4,7 +4,7 @@ import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; function describeListCollectionsAccuracyTests(prompt: string): AccuracyTestConfig { return { - systemPrompt: "Assume that you're already connected.", + injectConnectedAssumption: true, prompt: prompt, mockedTools: { "list-collections": function listCollections() { diff --git a/tests/accuracy/list-databases.test.ts b/tests/accuracy/list-databases.test.ts index d26fbc4e..cf06303e 100644 --- a/tests/accuracy/list-databases.test.ts +++ b/tests/accuracy/list-databases.test.ts @@ -4,7 +4,7 @@ import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; function describeListDatabasesAccuracyTests(prompt: string): AccuracyTestConfig { return { - systemPrompt: "Assume that you're already connected.", + injectConnectedAssumption: true, prompt: prompt, mockedTools: { "list-databases": function listDatabases() { diff --git a/tests/accuracy/sdk/describe-accuracy-tests.ts b/tests/accuracy/sdk/describe-accuracy-tests.ts index bf99d509..28fa3bd7 100644 --- a/tests/accuracy/sdk/describe-accuracy-tests.ts +++ b/tests/accuracy/sdk/describe-accuracy-tests.ts @@ -7,6 +7,7 @@ import { appendAccuracySnapshot } from "./accuracy-snapshot.js"; export interface AccuracyTestConfig { systemPrompt?: string; + injectConnectedAssumption?: boolean; prompt: string; expectedToolCalls: ExpectedToolCall[]; mockedTools: MockedTools; @@ -44,7 +45,10 @@ export function describeAccuracyTests( eachTest("$prompt", async function (testConfig) { testTools.mockTools(testConfig.mockedTools); - const conversation = await agent.prompt(testConfig.prompt, model, testTools.vercelAiTools()); + const promptForModel = testConfig.injectConnectedAssumption + ? 
[testConfig.prompt, "(Assume that you are already connected to a MongoDB cluster!)"].join(" ") + : testConfig.prompt; + const conversation = await agent.prompt(promptForModel, model, testTools.vercelAiTools()); const toolCalls = testTools.getToolCalls(); const toolCallingAccuracy = toolCallingAccuracyScorer(testConfig.expectedToolCalls, toolCalls); const parameterMatchingAccuracy = parameterMatchingAccuracyScorer(testConfig.expectedToolCalls, toolCalls); diff --git a/tests/accuracy/sdk/models.ts b/tests/accuracy/sdk/models.ts index e3f5ab1f..c653c79c 100644 --- a/tests/accuracy/sdk/models.ts +++ b/tests/accuracy/sdk/models.ts @@ -1,5 +1,6 @@ import { LanguageModelV1 } from "ai"; import { createGoogleGenerativeAI } from "@himanshusinghs/google"; +import { createAzure } from "@ai-sdk/azure"; import { ollama } from "ollama-ai-provider"; export interface Model

{ @@ -8,6 +9,22 @@ export interface Model

{ getModel(): P; } +export class OpenAIModel implements Model { + constructor(readonly modelName: string) {} + + isAvailable(): boolean { + return !!process.env.MDB_AZURE_OPEN_AI_API_KEY && !!process.env.MDB_AZURE_OPEN_AI_API_URL; + } + + getModel() { + return createAzure({ + baseURL: process.env.MDB_AZURE_OPEN_AI_API_URL, + apiKey: process.env.MDB_AZURE_OPEN_AI_API_KEY, + apiVersion: "2024-12-01-preview", + })(this.modelName); + } +} + export class GeminiModel implements Model { constructor(readonly modelName: string) {} @@ -35,9 +52,9 @@ export class OllamaModel implements Model { } const ALL_TESTABLE_MODELS = [ - new GeminiModel("gemini-1.5-flash"), new GeminiModel("gemini-2.0-flash"), - new OllamaModel("qwen3:1.7b"), + new OpenAIModel("gpt-4o"), + // new OllamaModel("qwen3:1.7b"), ]; export type TestableModels = ReturnType; From 1c5842746f57df474e5c807dd259ac05a00da4ed Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Tue, 1 Jul 2025 14:04:21 +0200 Subject: [PATCH 08/91] chore: use ListDatabasesTool response creator for tests --- src/tools/mongodb/metadata/listDatabases.ts | 23 +++++++++++---- tests/accuracy/list-databases.test.ts | 31 ++++++++++----------- 2 files changed, 32 insertions(+), 22 deletions(-) diff --git a/src/tools/mongodb/metadata/listDatabases.ts b/src/tools/mongodb/metadata/listDatabases.ts index 400f275b..c1022c5b 100644 --- a/src/tools/mongodb/metadata/listDatabases.ts +++ b/src/tools/mongodb/metadata/listDatabases.ts @@ -3,6 +3,17 @@ import { MongoDBToolBase } from "../mongodbTool.js"; import * as bson from "bson"; import { OperationType } from "../../tool.js"; +export function listDatabasesResponse(databases: { name: string; sizeOnDisk: string }[]): CallToolResult { + return { + content: databases.map((db) => { + return { + text: `Name: ${db.name}, Size: ${db.sizeOnDisk} bytes`, + type: "text", + }; + }), + }; +} + export class ListDatabasesTool extends MongoDBToolBase { public name = "list-databases"; protected description = "List all 
databases for a MongoDB connection"; @@ -13,13 +24,13 @@ export class ListDatabasesTool extends MongoDBToolBase { const provider = await this.ensureConnected(); const dbs = (await provider.listDatabases("")).databases as { name: string; sizeOnDisk: bson.Long }[]; - return { - content: dbs.map((db) => { + return listDatabasesResponse( + dbs.map((db) => { return { - text: `Name: ${db.name}, Size: ${db.sizeOnDisk.toString()} bytes`, - type: "text", + name: db.name, + sizeOnDisk: db.sizeOnDisk.toString(), }; - }), - }; + }) + ); } } diff --git a/tests/accuracy/list-databases.test.ts b/tests/accuracy/list-databases.test.ts index cf06303e..0a89db1d 100644 --- a/tests/accuracy/list-databases.test.ts +++ b/tests/accuracy/list-databases.test.ts @@ -1,25 +1,24 @@ import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; import { getAvailableModels } from "./sdk/models.js"; import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; +import { listDatabasesResponse } from "../../src/tools/mongodb/metadata/listDatabases.js"; -function describeListDatabasesAccuracyTests(prompt: string): AccuracyTestConfig { +function callsListDatabases(prompt: string): AccuracyTestConfig { return { injectConnectedAssumption: true, prompt: prompt, mockedTools: { "list-databases": function listDatabases() { - return { - content: [ - { - type: "text", - text: "Name: db1", - }, - { - type: "text", - text: "Name: db2", - }, - ], - }; + return listDatabasesResponse([ + { + name: "db1", + sizeOnDisk: "1024", + }, + { + name: "db2", + sizeOnDisk: "2048", + }, + ]); }, }, expectedToolCalls: [ @@ -32,7 +31,7 @@ function describeListDatabasesAccuracyTests(prompt: string): AccuracyTestConfig } describeAccuracyTests("list-databases", getAvailableModels(), [ - describeListDatabasesAccuracyTests("How many databases do I have?"), - describeListDatabasesAccuracyTests("List all the databases in my cluster."), - describeListDatabasesAccuracyTests("Is there a sample_mflix database in my 
cluster?"), + callsListDatabases("How many databases do I have?"), + callsListDatabases("List all the databases in my cluster."), + callsListDatabases("Is there a sample_mflix database in my cluster?"), ]); From 5ce954ed36b50e742e6de18fbdbeafb260e0398b Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Tue, 1 Jul 2025 14:05:51 +0200 Subject: [PATCH 09/91] chore: use ListCollectionsTool response creators in tests --- src/tools/mongodb/metadata/listCollections.ts | 45 ++++++++------ tests/accuracy/list-collections.test.ts | 62 ++++++++++++++----- 2 files changed, 72 insertions(+), 35 deletions(-) diff --git a/src/tools/mongodb/metadata/listCollections.ts b/src/tools/mongodb/metadata/listCollections.ts index 9611d541..5aad19ab 100644 --- a/src/tools/mongodb/metadata/listCollections.ts +++ b/src/tools/mongodb/metadata/listCollections.ts @@ -2,6 +2,28 @@ import { CallToolResult } from "@modelcontextprotocol/sdk/types.js"; import { DbOperationArgs, MongoDBToolBase } from "../mongodbTool.js"; import { ToolArgs, OperationType } from "../../tool.js"; +export function listCollectionsResponse(database: string, collections: string[]): CallToolResult { + if (collections.length === 0) { + return { + content: [ + { + type: "text", + text: `No collections found for database "${database}". To create a collection, use the "create-collection" tool.`, + }, + ], + }; + } + + return { + content: collections.map((collection) => { + return { + text: `Name: "${collection}"`, + type: "text", + }; + }), + }; +} + export class ListCollectionsTool extends MongoDBToolBase { public name = "list-collections"; protected description = "List all collections for a given database"; @@ -15,24 +37,9 @@ export class ListCollectionsTool extends MongoDBToolBase { const provider = await this.ensureConnected(); const collections = await provider.listCollections(database); - if (collections.length === 0) { - return { - content: [ - { - type: "text", - text: `No collections found for database "${database}". 
To create a collection, use the "create-collection" tool.`, - }, - ], - }; - } - - return { - content: collections.map((collection) => { - return { - text: `Name: "${collection.name}"`, - type: "text", - }; - }), - }; + return listCollectionsResponse( + database, + collections.map((collection) => `${collection.name}`) + ); } } diff --git a/tests/accuracy/list-collections.test.ts b/tests/accuracy/list-collections.test.ts index 2bc11dea..ac086859 100644 --- a/tests/accuracy/list-collections.test.ts +++ b/tests/accuracy/list-collections.test.ts @@ -1,25 +1,16 @@ import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; import { getAvailableModels } from "./sdk/models.js"; import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; +import { listCollectionsResponse } from "../../src/tools/mongodb/metadata/listCollections.js"; +import { listDatabasesResponse } from "../../src/tools/mongodb/metadata/listDatabases.js"; -function describeListCollectionsAccuracyTests(prompt: string): AccuracyTestConfig { +function callsListCollections(prompt: string): AccuracyTestConfig { return { injectConnectedAssumption: true, prompt: prompt, mockedTools: { "list-collections": function listCollections() { - return { - content: [ - { - type: "text", - text: "Name: coll1", - }, - { - type: "text", - text: "Name: coll1", - }, - ], - }; + return listCollectionsResponse("db1", ["coll1", "coll2"]); }, }, expectedToolCalls: [ @@ -31,8 +22,47 @@ function describeListCollectionsAccuracyTests(prompt: string): AccuracyTestConfi }; } +function callsListDatabasesAndListCollections(prompt: string): AccuracyTestConfig { + return { + injectConnectedAssumption: true, + prompt: prompt, + mockedTools: { + "list-collections": function listCollections() { + return listCollectionsResponse("db1", ["coll1", "coll2"]); + }, + "list-databases": function listDatabases() { + return listDatabasesResponse([ + { + name: "db1", + sizeOnDisk: "1024", + }, + { + name: "db2", + sizeOnDisk: 
"2048", + }, + ]); + }, + }, + expectedToolCalls: [ + { + toolName: "list-databases", + parameters: {}, + }, + { + toolName: "list-collections", + parameters: { database: "db1" }, + }, + { + toolName: "list-collections", + parameters: { database: "db2" }, + }, + ], + }; +} + describeAccuracyTests("list-collections", getAvailableModels(), [ - describeListCollectionsAccuracyTests("How many collections do I have in database db1?"), - describeListCollectionsAccuracyTests("List all the collections in my MongoDB database db1."), - describeListCollectionsAccuracyTests("Is there a coll1 collection in my MongoDB database db1?"), + callsListCollections("How many collections do I have in database db1?"), + callsListCollections("List all the collections in my MongoDB database db1."), + callsListCollections("Is there a coll1 collection in my MongoDB database db1?"), + callsListDatabasesAndListCollections("List all the collections that I have in total on my cluster?"), ]); From cfce25604483f103028a97b3abc662b82d67d7fd Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Tue, 1 Jul 2025 14:30:50 +0200 Subject: [PATCH 10/91] chore: tests for collection-indexes tool --- src/tools/mongodb/read/collectionIndexes.ts | 74 ++++++++++++++------- tests/accuracy/collection-indexes.test.ts | 42 ++++++++++++ 2 files changed, 93 insertions(+), 23 deletions(-) create mode 100644 tests/accuracy/collection-indexes.test.ts diff --git a/src/tools/mongodb/read/collectionIndexes.ts b/src/tools/mongodb/read/collectionIndexes.ts index ef3fa75d..7d541128 100644 --- a/src/tools/mongodb/read/collectionIndexes.ts +++ b/src/tools/mongodb/read/collectionIndexes.ts @@ -2,6 +2,44 @@ import { CallToolResult } from "@modelcontextprotocol/sdk/types.js"; import { DbOperationArgs, MongoDBToolBase } from "../mongodbTool.js"; import { ToolArgs, OperationType } from "../../tool.js"; +export function collectionIndexesResponse({ + database, + collection, + indexes = [], + namespaceNotFound, +}: { + database: string; + 
collection: string; + indexes?: { name: string; key: string }[]; + namespaceNotFound?: boolean; +}): CallToolResult { + if (namespaceNotFound) { + return { + content: [ + { + text: `The indexes for "${database}.${collection}" cannot be determined because the collection does not exist.`, + type: "text", + }, + ], + }; + } + + return { + content: [ + { + text: `Found ${indexes.length} indexes in the collection "${collection}":`, + type: "text", + }, + ...(indexes.map((indexDefinition) => { + return { + text: `Name "${indexDefinition.name}", definition: ${JSON.stringify(indexDefinition.key)}`, + type: "text", + }; + }) as { text: string; type: "text" }[]), + ], + }; +} + export class CollectionIndexesTool extends MongoDBToolBase { public name = "collection-indexes"; protected description = "Describe the indexes for a collection"; @@ -11,21 +49,14 @@ export class CollectionIndexesTool extends MongoDBToolBase { protected async execute({ database, collection }: ToolArgs): Promise { const provider = await this.ensureConnected(); const indexes = await provider.getIndexes(database, collection); - - return { - content: [ - { - text: `Found ${indexes.length} indexes in the collection "${collection}":`, - type: "text", - }, - ...(indexes.map((indexDefinition) => { - return { - text: `Name "${indexDefinition.name}", definition: ${JSON.stringify(indexDefinition.key)}`, - type: "text", - }; - }) as { text: string; type: "text" }[]), - ], - }; + return collectionIndexesResponse({ + database, + collection, + indexes: indexes.map((index) => ({ + name: `${index.name}`, + key: JSON.stringify(index.key), + })), + }); } protected handleError( @@ -33,14 +64,11 @@ export class CollectionIndexesTool extends MongoDBToolBase { args: ToolArgs ): Promise | CallToolResult { if (error instanceof Error && "codeName" in error && error.codeName === "NamespaceNotFound") { - return { - content: [ - { - text: `The indexes for "${args.database}.${args.collection}" cannot be determined because the 
collection does not exist.`, - type: "text", - }, - ], - }; + return collectionIndexesResponse({ + database: args.database, + collection: args.collection, + namespaceNotFound: true, + }); } return super.handleError(error, args); diff --git a/tests/accuracy/collection-indexes.test.ts b/tests/accuracy/collection-indexes.test.ts new file mode 100644 index 00000000..78b223e3 --- /dev/null +++ b/tests/accuracy/collection-indexes.test.ts @@ -0,0 +1,42 @@ +import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; +import { getAvailableModels } from "./sdk/models.js"; +import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; +import { collectionIndexesResponse } from "../../src/tools/mongodb/read/collectionIndexes.js"; + +function callsCollectionIndexes(prompt: string): AccuracyTestConfig { + return { + injectConnectedAssumption: true, + prompt: prompt, + mockedTools: { + "collection-indexes": function collectionIndexes() { + return collectionIndexesResponse({ + database: "db1", + collection: "coll1", + indexes: [ + { + name: "year", + key: JSON.stringify({ _id: 1 }), + }, + ], + }); + }, + }, + expectedToolCalls: [ + { + toolName: "collection-indexes", + parameters: { + database: "db1", + collection: "coll1", + }, + }, + ], + }; +} + +describeAccuracyTests("collection-indexes", getAvailableModels(), [ + callsCollectionIndexes("How many indexes do I have in 'db1.coll1' namespace?"), + callsCollectionIndexes("List all the indexes in coll1 collection in db1 database"), + callsCollectionIndexes( + `Will this query: ${JSON.stringify({ year: 1994 })} on the namespace 'db1.coll1' be a collection scan?` + ), +]); From c3a0a724ec7d374e363c33c95b831a25a8271175 Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Tue, 1 Jul 2025 16:16:20 +0200 Subject: [PATCH 11/91] modify prompt for list-collections prompt and log tools provided --- tests/accuracy/collection-indexes.test.ts | 2 +- tests/accuracy/sdk/describe-accuracy-tests.ts | 4 +++- 2 files changed, 4 
insertions(+), 2 deletions(-) diff --git a/tests/accuracy/collection-indexes.test.ts b/tests/accuracy/collection-indexes.test.ts index 78b223e3..683f386a 100644 --- a/tests/accuracy/collection-indexes.test.ts +++ b/tests/accuracy/collection-indexes.test.ts @@ -37,6 +37,6 @@ describeAccuracyTests("collection-indexes", getAvailableModels(), [ callsCollectionIndexes("How many indexes do I have in 'db1.coll1' namespace?"), callsCollectionIndexes("List all the indexes in coll1 collection in db1 database"), callsCollectionIndexes( - `Will this query: ${JSON.stringify({ year: 1994 })} on the namespace 'db1.coll1' be a collection scan?` + `Is the following query: ${JSON.stringify({ year: 1994 })} on the namespace 'db1.coll1' indexed?` ), ]); diff --git a/tests/accuracy/sdk/describe-accuracy-tests.ts b/tests/accuracy/sdk/describe-accuracy-tests.ts index 28fa3bd7..972f10b3 100644 --- a/tests/accuracy/sdk/describe-accuracy-tests.ts +++ b/tests/accuracy/sdk/describe-accuracy-tests.ts @@ -45,10 +45,11 @@ export function describeAccuracyTests( eachTest("$prompt", async function (testConfig) { testTools.mockTools(testConfig.mockedTools); + const toolsForModel = testTools.vercelAiTools(); const promptForModel = testConfig.injectConnectedAssumption ? 
[testConfig.prompt, "(Assume that you are already connected to a MongoDB cluster!)"].join(" ") : testConfig.prompt; - const conversation = await agent.prompt(promptForModel, model, testTools.vercelAiTools()); + const conversation = await agent.prompt(promptForModel, model, toolsForModel); const toolCalls = testTools.getToolCalls(); const toolCallingAccuracy = toolCallingAccuracyScorer(testConfig.expectedToolCalls, toolCalls); const parameterMatchingAccuracy = parameterMatchingAccuracyScorer(testConfig.expectedToolCalls, toolCalls); @@ -73,6 +74,7 @@ export function describeAccuracyTests( expect(parameterMatchingAccuracy).toBeGreaterThanOrEqual(0.5); } catch (error) { console.warn(`Accuracy test failed for ${model.modelName} - ${suiteName} - ${testConfig.prompt}`); + console.debug(`Provided tools`, JSON.stringify(toolsForModel, null, 2)); console.debug(`Conversation`, JSON.stringify(conversation, null, 2)); console.debug(`Tool calls`, JSON.stringify(toolCalls, null, 2)); console.debug(`Tool calling accuracy`, toolCallingAccuracy); From c71ac447020ecf538844f442cdcc2a734ce09bc3 Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Tue, 1 Jul 2025 16:32:06 +0200 Subject: [PATCH 12/91] chore: have mock generators return Promise of ToolResult as well --- tests/accuracy/sdk/test-tools.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/accuracy/sdk/test-tools.ts b/tests/accuracy/sdk/test-tools.ts index 595a9069..15bb0420 100644 --- a/tests/accuracy/sdk/test-tools.ts +++ b/tests/accuracy/sdk/test-tools.ts @@ -12,7 +12,7 @@ import { Telemetry } from "../../../src/telemetry/telemetry.js"; import { Server } from "../../../src/server.js"; import { ToolCall } from "./accuracy-scorers.js"; -type ToolResultGeneratorFn = (...parameters: unknown[]) => CallToolResult; +type ToolResultGeneratorFn = (...parameters: unknown[]) => CallToolResult | Promise; export type MockedTools = Record; function getDefaultToolResultGeneratorFn(): ToolResultGeneratorFn { @@ 
-81,7 +81,7 @@ export class TestTools { }; } - return toolResultGeneratorFn(args); + return await toolResultGeneratorFn(args); }, }); } From f6a8fcdfc15f92a9a60ceaa7d5fd48ef501b609e Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Tue, 1 Jul 2025 16:32:24 +0200 Subject: [PATCH 13/91] chore: tests for collection-schema tool --- .../mongodb/metadata/collectionSchema.ts | 59 +++++++++++-------- tests/accuracy/collection-schema.test.ts | 47 +++++++++++++++ 2 files changed, 80 insertions(+), 26 deletions(-) create mode 100644 tests/accuracy/collection-schema.test.ts diff --git a/src/tools/mongodb/metadata/collectionSchema.ts b/src/tools/mongodb/metadata/collectionSchema.ts index 693b8f91..2f419acb 100644 --- a/src/tools/mongodb/metadata/collectionSchema.ts +++ b/src/tools/mongodb/metadata/collectionSchema.ts @@ -1,7 +1,38 @@ import { CallToolResult } from "@modelcontextprotocol/sdk/types.js"; import { DbOperationArgs, MongoDBToolBase } from "../mongodbTool.js"; import { ToolArgs, OperationType } from "../../tool.js"; -import { getSimplifiedSchema } from "mongodb-schema"; +import { getSimplifiedSchema, SimplifiedSchema } from "mongodb-schema"; + +export function collectionSchemaResponse( + database: string, + collection: string, + schema: SimplifiedSchema +): CallToolResult { + const fieldsCount = Object.entries(schema).length; + if (fieldsCount === 0) { + return { + content: [ + { + text: `Could not deduce the schema for "${database}.${collection}". 
This may be because it doesn't exist or is empty.`, + type: "text", + }, + ], + }; + } + + return { + content: [ + { + text: `Found ${fieldsCount} fields in the schema for "${database}.${collection}"`, + type: "text", + }, + { + text: JSON.stringify(schema), + type: "text", + }, + ], + }; +} export class CollectionSchemaTool extends MongoDBToolBase { public name = "collection-schema"; @@ -14,30 +45,6 @@ export class CollectionSchemaTool extends MongoDBToolBase { const provider = await this.ensureConnected(); const documents = await provider.find(database, collection, {}, { limit: 5 }).toArray(); const schema = await getSimplifiedSchema(documents); - - const fieldsCount = Object.entries(schema).length; - if (fieldsCount === 0) { - return { - content: [ - { - text: `Could not deduce the schema for "${database}.${collection}". This may be because it doesn't exist or is empty.`, - type: "text", - }, - ], - }; - } - - return { - content: [ - { - text: `Found ${fieldsCount} fields in the schema for "${database}.${collection}"`, - type: "text", - }, - { - text: JSON.stringify(schema), - type: "text", - }, - ], - }; + return collectionSchemaResponse(database, collection, schema); } } diff --git a/tests/accuracy/collection-schema.test.ts b/tests/accuracy/collection-schema.test.ts new file mode 100644 index 00000000..e72c65de --- /dev/null +++ b/tests/accuracy/collection-schema.test.ts @@ -0,0 +1,47 @@ +import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; +import { getAvailableModels } from "./sdk/models.js"; +import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; +import { collectionSchemaResponse } from "../../src/tools/mongodb/metadata/collectionSchema.js"; +import { getSimplifiedSchema } from "mongodb-schema"; + +function callsCollectionSchema(prompt: string): AccuracyTestConfig { + return { + injectConnectedAssumption: true, + prompt: prompt, + mockedTools: { + "collection-schema": async function collectionSchema() { + return 
collectionSchemaResponse( + "db1", + "coll1", + await getSimplifiedSchema([ + { + name: "Sample name1", + dob: "28.11.2001", + location: "NY", + }, + { + name: "Sample name1", + dob: "28.11.2001", + location: "NY", + title: "Dr.", + }, + ]) + ); + }, + }, + expectedToolCalls: [ + { + toolName: "collection-schema", + parameters: { + database: "db1", + collection: "coll1", + }, + }, + ], + }; +} + +describeAccuracyTests("collection-schema", getAvailableModels(), [ + callsCollectionSchema("Is there a title field in 'db1.coll1' namespace?"), + callsCollectionSchema("What is the type of value stored in title field in coll1 collection in db1 database?"), +]); From ed0a6da87c5d49a05807a3982a0c42886d356d46 Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Tue, 1 Jul 2025 17:17:53 +0200 Subject: [PATCH 14/91] chore: do not fail tests on dropped accuracy --- tests/accuracy/sdk/describe-accuracy-tests.ts | 20 +++++++------------ 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/tests/accuracy/sdk/describe-accuracy-tests.ts b/tests/accuracy/sdk/describe-accuracy-tests.ts index 972f10b3..c602bf96 100644 --- a/tests/accuracy/sdk/describe-accuracy-tests.ts +++ b/tests/accuracy/sdk/describe-accuracy-tests.ts @@ -53,6 +53,13 @@ export function describeAccuracyTests( const toolCalls = testTools.getToolCalls(); const toolCallingAccuracy = toolCallingAccuracyScorer(testConfig.expectedToolCalls, toolCalls); const parameterMatchingAccuracy = parameterMatchingAccuracyScorer(testConfig.expectedToolCalls, toolCalls); + console.debug(`Conversation`, JSON.stringify(conversation, null, 2)); + console.debug(`Tool calls`, JSON.stringify(toolCalls, null, 2)); + console.debug( + "Tool calling accuracy: %s, Parameter Accuracy: %s", + toolCallingAccuracy, + parameterMatchingAccuracy + ); if (accuracyDatetime && accuracyCommit) { await appendAccuracySnapshot({ datetime: accuracyDatetime, @@ -68,19 +75,6 @@ export function describeAccuracyTests( `Skipping accuracy snapshot update for 
${model.modelName} - ${suiteName} - ${testConfig.prompt}` ); } - - try { - expect(toolCallingAccuracy).not.toEqual(0); - expect(parameterMatchingAccuracy).toBeGreaterThanOrEqual(0.5); - } catch (error) { - console.warn(`Accuracy test failed for ${model.modelName} - ${suiteName} - ${testConfig.prompt}`); - console.debug(`Provided tools`, JSON.stringify(toolsForModel, null, 2)); - console.debug(`Conversation`, JSON.stringify(conversation, null, 2)); - console.debug(`Tool calls`, JSON.stringify(toolCalls, null, 2)); - console.debug(`Tool calling accuracy`, toolCallingAccuracy); - console.debug(`Parameter matching accuracy`, parameterMatchingAccuracy); - throw error; - } }); }); } From c6da0b550a4e965664541926a300d532695bf5d7 Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Tue, 1 Jul 2025 17:17:58 +0200 Subject: [PATCH 15/91] chore: added tests for find tool --- src/tools/mongodb/read/find.ts | 36 ++++---- tests/accuracy/find.test.ts | 157 +++++++++++++++++++++++++++++++++ 2 files changed, 176 insertions(+), 17 deletions(-) create mode 100644 tests/accuracy/find.test.ts diff --git a/src/tools/mongodb/read/find.ts b/src/tools/mongodb/read/find.ts index 02c337ed..efad0eb9 100644 --- a/src/tools/mongodb/read/find.ts +++ b/src/tools/mongodb/read/find.ts @@ -13,7 +13,7 @@ export const FindArgs = { .describe("The query filter, matching the syntax of the query argument of db.collection.find()"), projection: z .record(z.string(), z.unknown()) - .optional() + // .optional() .describe("The projection, matching the syntax of the projection argument of db.collection.find()"), limit: z.number().optional().default(10).describe("The maximum number of documents to return"), sort: z @@ -22,6 +22,23 @@ export const FindArgs = { .describe("A document, describing the sort order, matching the syntax of the sort argument of cursor.sort()"), }; +export function findResponse(collection: string, documents: unknown[]): CallToolResult { + return { + content: [ + { + text: `Found 
${documents.length} documents in the collection "${collection}":`, + type: "text", + }, + ...documents.map<{ type: "text"; text: string }>((doc) => { + return { + text: EJSON.stringify(doc), + type: "text", + }; + }), + ], + }; +} + export class FindTool extends MongoDBToolBase { public name = "find"; protected description = "Run a find query against a MongoDB collection"; @@ -50,21 +67,6 @@ export class FindTool extends MongoDBToolBase { const documents = await provider.find(database, collection, filter, { projection, limit, sort }).toArray(); - const content: Array<{ text: string; type: "text" }> = [ - { - text: `Found ${documents.length} documents in the collection "${collection}":`, - type: "text", - }, - ...documents.map((doc) => { - return { - text: EJSON.stringify(doc), - type: "text", - } as { text: string; type: "text" }; - }), - ]; - - return { - content, - }; + return findResponse(collection, documents); } } diff --git a/tests/accuracy/find.test.ts b/tests/accuracy/find.test.ts new file mode 100644 index 00000000..0144e22b --- /dev/null +++ b/tests/accuracy/find.test.ts @@ -0,0 +1,157 @@ +import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; +import { getAvailableModels } from "./sdk/models.js"; +import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; +import { findResponse } from "../../src/tools/mongodb/read/find.js"; +import { MockedTools } from "./sdk/test-tools.js"; +import { collectionSchemaResponse } from "../../src/tools/mongodb/metadata/collectionSchema.js"; +import { getSimplifiedSchema } from "mongodb-schema"; + +const documents = [ + { + title: "book1", + author: "author1", + date_of_publish: "01.01.1990", + }, + { + title: "book2", + author: "author1", + date_of_publish: "01.01.1992", + }, + { + title: "book3", + author: "author2", + date_of_publish: "01.01.1990", + }, +]; + +function callsFindNoFilter(prompt: string): AccuracyTestConfig { + return { + injectConnectedAssumption: true, + prompt: prompt, + 
mockedTools: { + "collection-schema": async () => + collectionSchemaResponse("db1", "coll1", await getSimplifiedSchema(documents)), + find: () => findResponse("coll1", documents), + }, + expectedToolCalls: [ + { + toolName: "find", + parameters: { + database: "db1", + collection: "coll1", + }, + }, + ], + }; +} + +function callsFindWithFilter(prompt: string): AccuracyTestConfig { + return { + injectConnectedAssumption: true, + prompt: prompt, + mockedTools: { + "collection-schema": async () => + collectionSchemaResponse("db1", "coll1", await getSimplifiedSchema(documents)), + find: () => + findResponse( + "coll1", + documents.filter((doc) => doc.author === "author1") + ), + }, + expectedToolCalls: [ + { + toolName: "find", + parameters: { + database: "db1", + collection: "coll1", + filter: { author: "author1" }, + }, + }, + ], + }; +} + +function callsFindWithProjection(prompt: string): AccuracyTestConfig { + return { + injectConnectedAssumption: true, + prompt: prompt, + mockedTools: { + "collection-schema": async () => + collectionSchemaResponse("db1", "coll1", await getSimplifiedSchema(documents)), + find: () => findResponse("coll1", documents), + }, + expectedToolCalls: [ + { + toolName: "find", + parameters: { + database: "db1", + collection: "coll1", + projection: { title: 1 }, + }, + }, + ], + }; +} + +function callsFindWithProjectionAndFilters(prompt: string): AccuracyTestConfig { + return { + injectConnectedAssumption: true, + prompt: prompt, + mockedTools: { + "collection-schema": async () => + collectionSchemaResponse("db1", "coll1", await getSimplifiedSchema(documents)), + find: () => + findResponse( + "coll1", + documents.filter((doc) => doc.date_of_publish === "01.01.1992") + ), + }, + expectedToolCalls: [ + { + toolName: "find", + parameters: { + database: "db1", + collection: "coll1", + filter: { date_of_publish: "01.01.1992" }, + projection: { title: 1 }, + }, + }, + ], + }; +} + +function callsFindWithSortAndLimit(prompt: string): 
AccuracyTestConfig { + return { + injectConnectedAssumption: true, + prompt: prompt, + mockedTools: { + "collection-schema": async () => + collectionSchemaResponse("db1", "coll1", await getSimplifiedSchema(documents)), + find: () => findResponse("coll1", [documents[0], documents[1]]), + }, + expectedToolCalls: [ + { + toolName: "find", + parameters: { + database: "db1", + collection: "coll1", + sort: { date_of_publish: 1 }, + limit: 2, + }, + }, + ], + }; +} + +describeAccuracyTests("find", getAvailableModels(), [ + callsFindNoFilter("List all the documents in 'db1.coll1' namespace"), + callsFindNoFilter("Find all the documents from collection coll1 in database db1"), + callsFindWithFilter("Find all the books published by author name 'author1' in db1.coll1 namespace"), + callsFindWithFilter("Find all the documents in coll1 collection and db1 database where author is 'author1'"), + callsFindWithProjection("Give me all the title of the books available in 'db1.coll1' namespace"), + callsFindWithProjection("Give me all the title of the books published in available in 'db1.coll1' namespace"), + callsFindWithProjectionAndFilters( + "Find all the book titles from 'db1.coll1' namespace where date_of_publish is '01.01.1992'" + ), + callsFindWithSortAndLimit("List first two books sorted by the field date_of_publish in namespace db1.coll1"), +]); From 774640bbbf78a1eeb0f1209ab4d5c439c11f0a9f Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Thu, 3 Jul 2025 15:00:13 +0200 Subject: [PATCH 16/91] chore: tests for insert-many tool --- src/tools/mongodb/create/insertMany.ts | 28 +++++----- tests/accuracy/insert-many.test.ts | 72 ++++++++++++++++++++++++++ 2 files changed, 88 insertions(+), 12 deletions(-) create mode 100644 tests/accuracy/insert-many.test.ts diff --git a/src/tools/mongodb/create/insertMany.ts b/src/tools/mongodb/create/insertMany.ts index 4744e344..e8937825 100644 --- a/src/tools/mongodb/create/insertMany.ts +++ b/src/tools/mongodb/create/insertMany.ts @@ -3,6 
+3,21 @@ import { CallToolResult } from "@modelcontextprotocol/sdk/types.js"; import { DbOperationArgs, MongoDBToolBase } from "../mongodbTool.js"; import { ToolArgs, OperationType } from "../../tool.js"; +export function insertManyResponse(collection: string, insertedCount: number, insertedIds: unknown[]): CallToolResult { + return { + content: [ + { + text: `Inserted \`${insertedCount}\` document(s) into collection "${collection}"`, + type: "text", + }, + { + text: `Inserted IDs: ${insertedIds.join(", ")}`, + type: "text", + }, + ], + }; +} + export class InsertManyTool extends MongoDBToolBase { public name = "insert-many"; protected description = "Insert an array of documents into a MongoDB collection"; @@ -24,17 +39,6 @@ export class InsertManyTool extends MongoDBToolBase { const provider = await this.ensureConnected(); const result = await provider.insertMany(database, collection, documents); - return { - content: [ - { - text: `Inserted \`${result.insertedCount}\` document(s) into collection "${collection}"`, - type: "text", - }, - { - text: `Inserted IDs: ${Object.values(result.insertedIds).join(", ")}`, - type: "text", - }, - ], - }; + return insertManyResponse(collection, result.insertedCount, Object.values(result.insertedIds)); } } diff --git a/tests/accuracy/insert-many.test.ts b/tests/accuracy/insert-many.test.ts new file mode 100644 index 00000000..b720ac1c --- /dev/null +++ b/tests/accuracy/insert-many.test.ts @@ -0,0 +1,72 @@ +import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; +import { getAvailableModels } from "./sdk/models.js"; +import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; +import { insertManyResponse } from "../../src/tools/mongodb/create/insertMany.js"; + +function callsInsertMany(prompt: string): AccuracyTestConfig { + return { + injectConnectedAssumption: true, + prompt: prompt, + mockedTools: { + "insert-many": function listDatabases() { + return insertManyResponse("coll1", 3, ["1FOO", "2BAR", 
"3BAZ"]); + }, + }, + expectedToolCalls: [ + { + toolName: "insert-many", + parameters: { + database: "db1", + collection: "coll1", + documents: [ + { + id: 1, + name: "name1", + }, + { + id: 2, + name: "name2", + }, + { + id: 3, + name: "name3", + }, + ], + }, + }, + ], + }; +} + +function callsEmptyInsertMany(prompt: string) { + return { + injectConnectedAssumption: true, + prompt: prompt, + mockedTools: { + "insert-many": function listDatabases() { + return insertManyResponse("coll1", 3, ["1FOO", "2BAR", "3BAZ"]); + }, + }, + expectedToolCalls: [ + { + toolName: "insert-many", + parameters: { + database: "db1", + collection: "coll1", + documents: [{}, {}, {}], + }, + }, + ], + }; +} + +describeAccuracyTests("insert-many", getAvailableModels(), [ + callsInsertMany( + [ + "In my namespace 'db1.coll1', insert 3 documents each with the following fields:", + "- id: an incremental number starting from 1", + "- name: a string of format 'name'", + ].join("\n") + ), + callsEmptyInsertMany("Add three empty documents in collection 'coll1' inside database 'db1'"), +]); From 6e894bc3d8c71c3f75573d907529a5e069b34220 Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Thu, 3 Jul 2025 15:06:30 +0200 Subject: [PATCH 17/91] chore: tests for delete-many tool --- src/tools/mongodb/delete/deleteMany.ts | 20 ++++++---- tests/accuracy/delete-many.test.ts | 53 ++++++++++++++++++++++++++ 2 files changed, 65 insertions(+), 8 deletions(-) create mode 100644 tests/accuracy/delete-many.test.ts diff --git a/src/tools/mongodb/delete/deleteMany.ts b/src/tools/mongodb/delete/deleteMany.ts index aa135512..df02094b 100644 --- a/src/tools/mongodb/delete/deleteMany.ts +++ b/src/tools/mongodb/delete/deleteMany.ts @@ -4,6 +4,17 @@ import { DbOperationArgs, MongoDBToolBase } from "../mongodbTool.js"; import { ToolArgs, OperationType } from "../../tool.js"; import { checkIndexUsage } from "../../../helpers/indexCheck.js"; +export function deleteManyResponse(collection: string, delectedCount: number): 
CallToolResult { + return { + content: [ + { + text: `Deleted \`${delectedCount}\` document(s) from collection "${collection}"`, + type: "text", + }, + ], + }; +} + export class DeleteManyTool extends MongoDBToolBase { public name = "delete-many"; protected description = "Removes all documents that match the filter from a MongoDB collection"; @@ -45,13 +56,6 @@ export class DeleteManyTool extends MongoDBToolBase { const result = await provider.deleteMany(database, collection, filter); - return { - content: [ - { - text: `Deleted \`${result.deletedCount}\` document(s) from collection "${collection}"`, - type: "text", - }, - ], - }; + return deleteManyResponse(collection, result.deletedCount); } } diff --git a/tests/accuracy/delete-many.test.ts b/tests/accuracy/delete-many.test.ts new file mode 100644 index 00000000..ddda1d50 --- /dev/null +++ b/tests/accuracy/delete-many.test.ts @@ -0,0 +1,53 @@ +import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; +import { getAvailableModels } from "./sdk/models.js"; +import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; +import { deleteManyResponse } from "../../src/tools/mongodb/delete/deleteMany.js"; + +function callsDeleteManyWithEmptyFilters(prompt: string): AccuracyTestConfig { + return { + injectConnectedAssumption: true, + prompt: prompt, + mockedTools: { + "delete-many": function listDatabases() { + return deleteManyResponse("coll1", 10); + }, + }, + expectedToolCalls: [ + { + toolName: "delete-many", + parameters: { + database: "db1", + collection: "coll1", + }, + }, + ], + }; +} + +function callsDeleteManyWithFilters(prompt: string): AccuracyTestConfig { + return { + injectConnectedAssumption: true, + prompt: prompt, + mockedTools: { + "delete-many": function listDatabases() { + return deleteManyResponse("coll1", 10); + }, + }, + expectedToolCalls: [ + { + toolName: "delete-many", + parameters: { + database: "db1", + collection: "coll1", + filters: { provider: "BongoDB" }, + }, + }, 
+ ], + }; +} + +describeAccuracyTests("delete-many", getAvailableModels(), [ + callsDeleteManyWithEmptyFilters("Delete all the documents from 'db1.coll1' namespace"), + callsDeleteManyWithEmptyFilters("Purge the collection 'coll1' in database 'db1'"), + callsDeleteManyWithFilters("Remove all the documents from namespace 'db1.coll1' where provider is 'BongoDB'"), +]); From 942bfc062cb0c4e7708a62846064f151992ce3da Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Thu, 3 Jul 2025 16:05:57 +0200 Subject: [PATCH 18/91] chore: add oepnai provider --- package-lock.json | 17 +++++++++++++++++ tests/accuracy/sdk/models.ts | 18 +++++++++++++++++- 2 files changed, 34 insertions(+), 1 deletion(-) diff --git a/package-lock.json b/package-lock.json index f42c14c5..0f20cf48 100644 --- a/package-lock.json +++ b/package-lock.json @@ -59,6 +59,23 @@ "@himanshusinghs/ai-sdk-google": { "extraneous": true }, + "node_modules/@ai-sdk/anthropic": { + "version": "1.2.12", + "resolved": "https://registry.npmjs.org/@ai-sdk/anthropic/-/anthropic-1.2.12.tgz", + "integrity": "sha512-YSzjlko7JvuiyQFmI9RN1tNZdEiZxc+6xld/0tq/VkJaHpEzGAb1yiNxxvmYVcjvfu/PcvCxAAYXmTYQQ63IHQ==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "@ai-sdk/provider": "1.1.3", + "@ai-sdk/provider-utils": "2.2.8" + }, + "engines": { + "node": ">=18" + }, + "peerDependencies": { + "zod": "^3.0.0" + } + }, "node_modules/@ai-sdk/azure": { "version": "1.3.23", "resolved": "https://registry.npmjs.org/@ai-sdk/azure/-/azure-1.3.23.tgz", diff --git a/tests/accuracy/sdk/models.ts b/tests/accuracy/sdk/models.ts index c653c79c..1fe4fd58 100644 --- a/tests/accuracy/sdk/models.ts +++ b/tests/accuracy/sdk/models.ts @@ -1,6 +1,7 @@ import { LanguageModelV1 } from "ai"; import { createGoogleGenerativeAI } from "@himanshusinghs/google"; import { createAzure } from "@ai-sdk/azure"; +import { createOpenAI } from "@ai-sdk/openai"; import { ollama } from "ollama-ai-provider"; export interface Model

{ @@ -12,6 +13,20 @@ export interface Model

{ export class OpenAIModel implements Model { constructor(readonly modelName: string) {} + isAvailable(): boolean { + return !!process.env.MDB_OPEN_AI_API_KEY; + } + + getModel() { + return createOpenAI({ + apiKey: process.env.MDB_OPEN_AI_API_KEY, + })(this.modelName); + } +} + +export class AzureOpenAIModel implements Model { + constructor(readonly modelName: string) {} + isAvailable(): boolean { return !!process.env.MDB_AZURE_OPEN_AI_API_KEY && !!process.env.MDB_AZURE_OPEN_AI_API_URL; } @@ -53,7 +68,8 @@ export class OllamaModel implements Model { const ALL_TESTABLE_MODELS = [ new GeminiModel("gemini-2.0-flash"), - new OpenAIModel("gpt-4o"), + // new OpenAIModel("gpt-4o"), + // new AzureOpenAIModel("gpt-4o"), // new OllamaModel("qwen3:1.7b"), ]; From 34bd4c2cf00579092b73cb6223ab43d0a2c25be7 Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Fri, 4 Jul 2025 07:28:52 +0200 Subject: [PATCH 19/91] chore: fixes accuracy scorer for position independent matching --- tests/accuracy/sdk/accuracy-scorers.ts | 39 ++++++++++++++++---------- 1 file changed, 24 insertions(+), 15 deletions(-) diff --git a/tests/accuracy/sdk/accuracy-scorers.ts b/tests/accuracy/sdk/accuracy-scorers.ts index bf92eead..7bd8b969 100644 --- a/tests/accuracy/sdk/accuracy-scorers.ts +++ b/tests/accuracy/sdk/accuracy-scorers.ts @@ -37,28 +37,37 @@ export function parameterMatchingAccuracyScorer( return 1; } - const toolCallScores: number[] = []; - const checkedToolCallIds = new Set(); + const usedActualIndexes = new Set(); + const scores: number[] = []; - for (const expectedToolCall of expectedToolCalls) { - const matchingActualToolCall = actualToolCalls.find( - (actualToolCall) => - actualToolCall.toolName === expectedToolCall.toolName && - !checkedToolCallIds.has(actualToolCall.toolCallId) - ); + for (const expectedCall of expectedToolCalls) { + // Find all unmatched actual tool calls with the same tool name + const candidates = actualToolCalls + .map((call, index) => ({ call, index })) + .filter(({ 
call, index }) => !usedActualIndexes.has(index) && call.toolName === expectedCall.toolName); - if (!matchingActualToolCall) { - toolCallScores.push(0); + if (candidates.length === 0) { + scores.push(0); continue; } - checkedToolCallIds.add(matchingActualToolCall.toolCallId); - const score = compareParams(expectedToolCall.parameters, matchingActualToolCall.parameters); - toolCallScores.push(score); + // Pick the candidate with the best parameter match + let bestScore = -1; + let bestIndex = -1; + for (const { call, index } of candidates) { + const score = compareParams(expectedCall.parameters, call.parameters); + if (score > bestScore) { + bestScore = score; + bestIndex = index; + } + } + + usedActualIndexes.add(bestIndex); + scores.push(bestScore); } - const totalScore = toolCallScores.reduce((sum, score) => sum + score, 0); - return totalScore / toolCallScores.length; + const totalScore = scores.reduce((sum, score) => sum + score, 0); + return totalScore / scores.length; } /** From 537fe2a929fe135178a6dbcc8ce45a9c855a8c3d Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Fri, 4 Jul 2025 07:34:19 +0200 Subject: [PATCH 20/91] chore: replace mock mcp client with real (mockable) mcp client When writing test cases, I realized that it is too much duplicated effort to write and maintain mocks. So instead of having only a mocked mcp client, this commit introduces a real mcp client that talks to our mcp server and is still mockable. We are now setting up real MCP client with test data in mongodb database spun up for test suites. Mocking is still an option but we likely never feel the need for that. 
--- tests/accuracy/sdk/accuracy-testing-client.ts | 76 ++ tests/accuracy/sdk/describe-accuracy-tests.ts | 98 +-- tests/accuracy/sdk/test-tools.ts | 140 ---- .../test-data-dumps/comics.books.json | 608 ++++++++++++++ .../test-data-dumps/comics.characters.json | 576 ++++++++++++++ .../test-data-dumps/mflix.movies.json | 687 ++++++++++++++++ .../accuracy/test-data-dumps/mflix.shows.json | 750 ++++++++++++++++++ .../tools/mongodb/mongodbHelpers.ts | 53 +- 8 files changed, 2799 insertions(+), 189 deletions(-) create mode 100644 tests/accuracy/sdk/accuracy-testing-client.ts delete mode 100644 tests/accuracy/sdk/test-tools.ts create mode 100644 tests/accuracy/test-data-dumps/comics.books.json create mode 100644 tests/accuracy/test-data-dumps/comics.characters.json create mode 100644 tests/accuracy/test-data-dumps/mflix.movies.json create mode 100644 tests/accuracy/test-data-dumps/mflix.shows.json diff --git a/tests/accuracy/sdk/accuracy-testing-client.ts b/tests/accuracy/sdk/accuracy-testing-client.ts new file mode 100644 index 00000000..de7a0671 --- /dev/null +++ b/tests/accuracy/sdk/accuracy-testing-client.ts @@ -0,0 +1,76 @@ +import path from "path"; +import { v4 as uuid } from "uuid"; +import { fileURLToPath } from "url"; +import { experimental_createMCPClient as createMCPClient, tool as createVercelTool } from "ai"; +import { CallToolResult } from "@modelcontextprotocol/sdk/types.js"; +import { StdioClientTransport } from "@modelcontextprotocol/sdk/client/stdio.js"; + +import { ToolCall } from "./accuracy-scorers.js"; + +const __dirname = fileURLToPath(import.meta.url); +const distPath = path.join(__dirname, "..", "..", "..", "..", "dist"); +const cliScriptPath = path.join(distPath, "index.js"); + +type ToolResultGeneratorFn = (...parameters: unknown[]) => CallToolResult | Promise; +export type MockedTools = Record; + +export class AccuracyTestingClient { + private mockedTools: MockedTools = {}; + private recordedToolCalls: ToolCall[] = []; + private 
constructor(private readonly client: Awaited>) {} + + async close() { + await this.client?.close(); + } + + async vercelTools() { + const vercelTools = (await this.client?.tools()) ?? {}; + const rewrappedVercelTools: typeof vercelTools = {}; + for (const [toolName, tool] of Object.entries(vercelTools)) { + rewrappedVercelTools[toolName] = createVercelTool({ + ...tool, + execute: async (args, options) => { + this.recordedToolCalls.push({ + toolCallId: uuid(), + toolName: toolName, + parameters: args, + }); + const toolResultGeneratorFn = this.mockedTools[toolName]; + if (toolResultGeneratorFn) { + return await toolResultGeneratorFn(args); + } + + return tool.execute(args, options); + }, + }); + } + + return rewrappedVercelTools; + } + + getToolCalls() { + return this.recordedToolCalls; + } + + mockTools(mockedTools: MockedTools) { + this.mockedTools = mockedTools; + } + + resetForTests() { + this.mockTools({}); + this.recordedToolCalls = []; + } + + static async initializeClient(mdbConnectionString: string) { + const clientTransport = new StdioClientTransport({ + command: process.execPath, + args: [cliScriptPath, "--connectionString", mdbConnectionString], + }); + + const client = await createMCPClient({ + transport: clientTransport, + }); + + return new AccuracyTestingClient(client); + } +} diff --git a/tests/accuracy/sdk/describe-accuracy-tests.ts b/tests/accuracy/sdk/describe-accuracy-tests.ts index c602bf96..dd224387 100644 --- a/tests/accuracy/sdk/describe-accuracy-tests.ts +++ b/tests/accuracy/sdk/describe-accuracy-tests.ts @@ -1,9 +1,8 @@ -import { Tool } from "@modelcontextprotocol/sdk/types.js"; -import { discoverMongoDBTools, TestTools, MockedTools } from "./test-tools.js"; import { TestableModels } from "./models.js"; import { ExpectedToolCall, parameterMatchingAccuracyScorer, toolCallingAccuracyScorer } from "./accuracy-scorers.js"; import { Agent, getVercelToolCallingAgent } from "./agent.js"; -import { appendAccuracySnapshot } from 
"./accuracy-snapshot.js"; +import { prepareTestData, setupMongoDBIntegrationTest } from "../../integration/tools/mongodb/mongodbHelpers.js"; +import { AccuracyTestingClient, MockedTools } from "./accuracy-testing-client.js"; export interface AccuracyTestConfig { systemPrompt?: string; @@ -13,68 +12,71 @@ export interface AccuracyTestConfig { mockedTools: MockedTools; } +export function describeSuite(suiteName: string, testConfigs: AccuracyTestConfig[]) { + return { + [suiteName]: testConfigs, + }; +} + export function describeAccuracyTests( - suiteName: string, models: TestableModels, - accuracyTestConfigs: AccuracyTestConfig[] + accuracyTestConfigs: { + [suiteName: string]: AccuracyTestConfig[]; + } ) { - const accuracyDatetime = process.env.MDB_ACCURACY_DATETIME; - const accuracyCommit = process.env.MDB_ACCURACY_COMMIT; - if (!models.length) { - console.warn(`No models available to test ${suiteName}`); - return; + throw new Error("No models available to test!"); } const eachModel = describe.each(models); - const eachTest = it.each(accuracyTestConfigs); + const eachSuite = describe.each(Object.keys(accuracyTestConfigs)); + + eachModel(`$modelName`, function (model) { + const mdbIntegration = setupMongoDBIntegrationTest(); + const populateTestData = prepareTestData(mdbIntegration); - eachModel(`$modelName - ${suiteName}`, function (model) { - let mcpTools: Tool[]; - let testTools: TestTools; + let testMCPClient: AccuracyTestingClient; let agent: Agent; beforeAll(async () => { - mcpTools = await discoverMongoDBTools(); + testMCPClient = await AccuracyTestingClient.initializeClient(mdbIntegration.connectionString()); + agent = getVercelToolCallingAgent(); }); - beforeEach(() => { - testTools = new TestTools(mcpTools); - agent = getVercelToolCallingAgent(); + beforeEach(async () => { + await populateTestData(); + testMCPClient.resetForTests(); + }); + + afterAll(async () => { + await testMCPClient.close(); }); - eachTest("$prompt", async function (testConfig) { - 
testTools.mockTools(testConfig.mockedTools); - const toolsForModel = testTools.vercelAiTools(); - const promptForModel = testConfig.injectConnectedAssumption - ? [testConfig.prompt, "(Assume that you are already connected to a MongoDB cluster!)"].join(" ") - : testConfig.prompt; - const conversation = await agent.prompt(promptForModel, model, toolsForModel); - const toolCalls = testTools.getToolCalls(); - const toolCallingAccuracy = toolCallingAccuracyScorer(testConfig.expectedToolCalls, toolCalls); - const parameterMatchingAccuracy = parameterMatchingAccuracyScorer(testConfig.expectedToolCalls, toolCalls); - console.debug(`Conversation`, JSON.stringify(conversation, null, 2)); - console.debug(`Tool calls`, JSON.stringify(toolCalls, null, 2)); - console.debug( - "Tool calling accuracy: %s, Parameter Accuracy: %s", - toolCallingAccuracy, - parameterMatchingAccuracy - ); - if (accuracyDatetime && accuracyCommit) { - await appendAccuracySnapshot({ - datetime: accuracyDatetime, - commit: accuracyCommit, - model: model.modelName, - suite: suiteName, - test: testConfig.prompt, + eachSuite("%s", function (suiteName) { + const eachTest = it.each(accuracyTestConfigs[suiteName] ?? []); + + eachTest("$prompt", async function (testConfig) { + testMCPClient.mockTools(testConfig.mockedTools); + const toolsForModel = await testMCPClient.vercelTools(); + const promptForModel = testConfig.injectConnectedAssumption + ? 
[testConfig.prompt, "(Assume that you are already connected to a MongoDB cluster!)"].join(" ") + : testConfig.prompt; + const conversation = await agent.prompt(promptForModel, model, toolsForModel); + const toolCalls = testMCPClient.getToolCalls(); + const toolCallingAccuracy = toolCallingAccuracyScorer(testConfig.expectedToolCalls, toolCalls); + const parameterMatchingAccuracy = parameterMatchingAccuracyScorer( + testConfig.expectedToolCalls, + toolCalls + ); + console.debug(testConfig.prompt); + console.debug(`Conversation`, JSON.stringify(conversation, null, 2)); + // console.debug(`Tool calls`, JSON.stringify(toolCalls, null, 2)); + console.debug( + "Tool calling accuracy: %s, Parameter Accuracy: %s", toolCallingAccuracy, - parameterAccuracy: parameterMatchingAccuracy, - }); - } else { - console.info( - `Skipping accuracy snapshot update for ${model.modelName} - ${suiteName} - ${testConfig.prompt}` + parameterMatchingAccuracy ); - } + }); }); }); } diff --git a/tests/accuracy/sdk/test-tools.ts b/tests/accuracy/sdk/test-tools.ts deleted file mode 100644 index 15bb0420..00000000 --- a/tests/accuracy/sdk/test-tools.ts +++ /dev/null @@ -1,140 +0,0 @@ -import { JSONSchema7 } from "json-schema"; -import { v4 as uuid } from "uuid"; -import { Tool as VercelTool, Schema, tool as createVercelTool, jsonSchema } from "ai"; -import { Client } from "@modelcontextprotocol/sdk/client/index.js"; -import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js"; -import { CallToolResult, Tool } from "@modelcontextprotocol/sdk/types.js"; - -import { InMemoryTransport } from "../../integration/inMemoryTransport.js"; -import { defaultTestConfig } from "../../integration/helpers.js"; -import { Session } from "../../../src/session.js"; -import { Telemetry } from "../../../src/telemetry/telemetry.js"; -import { Server } from "../../../src/server.js"; -import { ToolCall } from "./accuracy-scorers.js"; - -type ToolResultGeneratorFn = (...parameters: unknown[]) => CallToolResult | 
Promise; -export type MockedTools = Record; - -function getDefaultToolResultGeneratorFn(): ToolResultGeneratorFn { - return () => ({ - content: [ - { - type: "text", - text: `Mock implementation for tool not present`, - }, - ], - isError: true, - }); -} - -export class TestTools { - private mockedTools: MockedTools = {}; - private recordedToolCalls: ToolCall[] = []; - - constructor(private readonly mcpTools: Tool[]) { - for (const mcpTool of mcpTools) { - this.mockedTools[mcpTool.name] = getDefaultToolResultGeneratorFn(); - } - } - - getToolCalls() { - return this.recordedToolCalls; - } - - mockTools(mockedTools: MockedTools) { - for (const toolName in mockedTools) { - const toolResultGeneratorFn = mockedTools[toolName]; - if (!this.mockedTools[toolName]) { - throw new Error(`Attempted to mock unrecognized tool - ${toolName}`); - } - - if (!toolResultGeneratorFn) { - // Are you happy TS? - continue; - } - this.mockedTools[toolName] = toolResultGeneratorFn; - } - } - - vercelAiTools(): Record>> { - const vercelTools: Record>> = {}; - for (const tool of this.mcpTools) { - vercelTools[tool.name] = createVercelTool({ - description: tool.description, - parameters: jsonSchema(tool.inputSchema as JSONSchema7), - // eslint-disable-next-line @typescript-eslint/require-await - execute: async (args: unknown) => { - this.recordedToolCalls.push({ - toolCallId: uuid(), - toolName: tool.name, - parameters: args, - }); - const toolResultGeneratorFn = this.mockedTools[tool.name]; - if (!toolResultGeneratorFn) { - return { - content: [ - { - type: "text", - text: `Could not resolve tool generator for ${tool.name}`, - }, - ], - }; - } - - return await toolResultGeneratorFn(args); - }, - }); - } - return vercelTools; - } -} - -export async function discoverMongoDBTools(): Promise { - let mcpClient: Client | undefined; - let mcpServer: Server | undefined; - try { - const serverTransport = new InMemoryTransport(); - const clientTransport = new InMemoryTransport(); - - await 
serverTransport.start(); - await clientTransport.start(); - - void serverTransport.output.pipeTo(clientTransport.input); - void clientTransport.output.pipeTo(serverTransport.input); - - const session = new Session({ - apiBaseUrl: defaultTestConfig.apiBaseUrl, - }); - - const telemetry = Telemetry.create(session, defaultTestConfig); - - mcpClient = new Client( - { - name: "tool-discovery-client", - version: "0.0.0", - }, - { - capabilities: {}, - } - ); - - mcpServer = new Server({ - session, - userConfig: defaultTestConfig, - telemetry, - mcpServer: new McpServer({ - name: "test-server", - version: "5.2.3", - }), - }); - - await mcpServer.connect(serverTransport); - await mcpClient.connect(clientTransport); - - return (await mcpClient.listTools()).tools; - } finally { - await mcpClient?.close(); - await mcpServer?.session?.close(); - await mcpServer?.close(); - } -} diff --git a/tests/accuracy/test-data-dumps/comics.books.json b/tests/accuracy/test-data-dumps/comics.books.json new file mode 100644 index 00000000..3bcb9ecc --- /dev/null +++ b/tests/accuracy/test-data-dumps/comics.books.json @@ -0,0 +1,608 @@ +[ + { + "_id": "fa53ead3-36f3-414c-9b3a-53aa9cf5038a", + "title": "Configurable dedicated project", + "publisher": "Dark Horse Comics", + "release_date": "2007-03-02T00:00:00", + "issues": 118, + "main_characters": [ + "Stephen Shaw" + ], + "genre": [ + "Sci-Fi" + ] + }, + { + "_id": "b2e993fb-2688-4ab0-9512-f8ada5faa948", + "title": "Focused intangible service-desk", + "publisher": "Image Comics", + "release_date": "1998-12-07T00:00:00", + "issues": 137, + "main_characters": [ + "Margaret Hogan" + ], + "genre": [ + "Adventure", + "Horror" + ] + }, + { + "_id": "f674a05a-12c8-4344-875c-6cd1fcba8f9d", + "title": "Expanded secondary system engine", + "publisher": "DC Comics", + "release_date": "2012-12-01T00:00:00", + "issues": 227, + "main_characters": [ + "Joseph Cook", + "Tammy Bishop" + ], + "genre": [ + "Superhero" + ] + }, + { + "_id": 
"bb72b493-2a61-41d7-9406-dfaf6e51a425", + "title": "Customizable zero-defect Graphic Interface", + "publisher": "DC Comics", + "release_date": "2011-02-24T00:00:00", + "issues": 270, + "main_characters": [ + "Sandra Moss" + ], + "genre": [ + "Fantasy" + ] + }, + { + "_id": "ea85131f-dfc8-4997-b3b0-996138185d73", + "title": "Reduced eco-centric help-desk", + "publisher": "Dark Horse Comics", + "release_date": "2021-03-12T00:00:00", + "issues": 202, + "main_characters": [ + "Margaret Hogan", + "Angelica Stein", + "Tammy Murphy", + "Larry Hensley" + ], + "genre": [ + "Adventure", + "Horror" + ] + }, + { + "_id": "fdd56270-eb31-4456-8bf4-df81371eb290", + "title": "Triple-buffered dedicated help-desk", + "publisher": "Image Comics", + "release_date": "1964-09-20T00:00:00", + "issues": 36, + "main_characters": [ + "Richard Cooper", + "James Sanchez", + "Micheal Brown", + "Jeremy Rice" + ], + "genre": [ + "Fantasy", + "Action" + ] + }, + { + "_id": "6de66ba4-3975-4055-824c-cda5caf517d2", + "title": "Operative logistical secured line", + "publisher": "Marvel Comics", + "release_date": "2007-11-19T00:00:00", + "issues": 55, + "main_characters": [ + "Joseph Bowman", + "Robert Logan", + "Ashley Watkins" + ], + "genre": [ + "Sci-Fi", + "Horror" + ] + }, + { + "_id": "e3cafdbf-e97a-47c9-a848-bdd82e12f8f7", + "title": "Multi-lateral multi-state framework", + "publisher": "IDW Publishing", + "release_date": "2011-09-14T00:00:00", + "issues": 250, + "main_characters": [ + "Ashley Watkins", + "Virginia Watts", + "Lindsay Anderson", + "Scott Garcia" + ], + "genre": [ + "Action", + "Horror" + ] + }, + { + "_id": "547190cd-5c9e-44c5-b8f9-afeefd039001", + "title": "Re-engineered encompassing standardization", + "publisher": "Marvel Comics", + "release_date": "1987-04-16T00:00:00", + "issues": 235, + "main_characters": [ + "Julie Goodwin" + ], + "genre": [ + "Sci-Fi" + ] + }, + { + "_id": "ba3d82f7-8edc-408c-8212-c0d6634624ee", + "title": "Fully-configurable local success", + 
"publisher": "Dark Horse Comics", + "release_date": "1979-09-13T00:00:00", + "issues": 239, + "main_characters": [ + "Chad Pham", + "Lindsay Anderson", + "Carlos Burton" + ], + "genre": [ + "Adventure" + ] + }, + { + "_id": "a6bc8677-22ab-415a-bfe2-731a9f887cb9", + "title": "Realigned zero-defect capability", + "publisher": "Marvel Comics", + "release_date": "2023-10-01T00:00:00", + "issues": 163, + "main_characters": [ + "Kevin Humphrey", + "Maria Wright", + "Virginia Watts" + ], + "genre": [ + "Fantasy", + "Action" + ] + }, + { + "_id": "fb986790-df22-4db4-8168-c76e9e9471f8", + "title": "Sharable bottom-line frame", + "publisher": "IDW Publishing", + "release_date": "2016-09-28T00:00:00", + "issues": 14, + "main_characters": [ + "Brian Vincent" + ], + "genre": [ + "Sci-Fi", + "Fantasy" + ] + }, + { + "_id": "700aa115-dc5a-4be6-b275-bfb943c95ee0", + "title": "Centralized next generation middleware", + "publisher": "Image Comics", + "release_date": "1970-04-16T00:00:00", + "issues": 5, + "main_characters": [ + "Joseph Cook" + ], + "genre": [ + "Fantasy" + ] + }, + { + "_id": "7959187e-9693-43a1-ae2d-c168431fceb2", + "title": "Re-engineered heuristic array", + "publisher": "IDW Publishing", + "release_date": "2019-02-15T00:00:00", + "issues": 121, + "main_characters": [ + "Angelica Stein", + "Benjamin Morris", + "Jeremy Rice" + ], + "genre": [ + "Fantasy", + "Action" + ] + }, + { + "_id": "d6018445-5149-42e7-9d87-eb1b181ce20c", + "title": "Programmable transitional collaboration", + "publisher": "DC Comics", + "release_date": "1999-08-10T00:00:00", + "issues": 235, + "main_characters": [ + "Joseph Cook", + "Cynthia Brown", + "Carlos Burton", + "Micheal Brown" + ], + "genre": [ + "Adventure" + ] + }, + { + "_id": "055507ff-7a48-4df8-9ba9-7b6c10e11836", + "title": "Object-based dynamic knowledgebase", + "publisher": "Image Comics", + "release_date": "1993-02-24T00:00:00", + "issues": 189, + "main_characters": [ + "Cristian Oneal", + "Brian Vincent", + "Holly Green", + 
"James Sanchez" + ], + "genre": [ + "Sci-Fi", + "Fantasy" + ] + }, + { + "_id": "1add2da3-68e6-48a3-9703-b593c9e0bf2e", + "title": "Enhanced asynchronous matrices", + "publisher": "DC Comics", + "release_date": "2001-03-01T00:00:00", + "issues": 176, + "main_characters": [ + "Justin Martinez", + "Tammy Murphy" + ], + "genre": [ + "Action", + "Fantasy" + ] + }, + { + "_id": "c0fe2869-eb7d-4f09-a773-028387a54969", + "title": "Synergized maximized artificial intelligence", + "publisher": "DC Comics", + "release_date": "1976-09-05T00:00:00", + "issues": 68, + "main_characters": [ + "Christopher Elliott", + "Maria Wright" + ], + "genre": [ + "Superhero", + "Adventure" + ] + }, + { + "_id": "c2fafbf6-5f71-4f31-9775-803e8c77e467", + "title": "Switchable bottom-line complexity", + "publisher": "Marvel Comics", + "release_date": "2012-08-12T00:00:00", + "issues": 156, + "main_characters": [ + "Lindsay Anderson", + "Virginia Watts", + "Robert Logan", + "Margaret Hogan" + ], + "genre": [ + "Adventure" + ] + }, + { + "_id": "f72be3a7-d4be-40a1-ad66-370b44759047", + "title": "Triple-buffered impactful customer loyalty", + "publisher": "Marvel Comics", + "release_date": "1976-09-18T00:00:00", + "issues": 275, + "main_characters": [ + "Sandra Moss", + "Charles Blair", + "Justin Martinez" + ], + "genre": [ + "Fantasy", + "Action" + ] + }, + { + "_id": "da5be16e-13e8-42d5-8954-bd89919395af", + "title": "Programmable 24/7 website", + "publisher": "DC Comics", + "release_date": "2023-11-06T00:00:00", + "issues": 278, + "main_characters": [ + "Luis Callahan", + "Carlos Burton", + "Cristian Oneal", + "Michelle Valdez" + ], + "genre": [ + "Horror", + "Fantasy" + ] + }, + { + "_id": "92afc1e6-f703-4aa7-9866-3b62f2784fec", + "title": "Advanced incremental framework", + "publisher": "Image Comics", + "release_date": "2008-07-21T00:00:00", + "issues": 109, + "main_characters": [ + "Holly Green", + "Diana Mata", + "Julie Goodwin" + ], + "genre": [ + "Horror", + "Sci-Fi" + ] + }, + { + "_id": 
"fec61fdd-bddb-431a-b14a-d81601a47cf8", + "title": "Front-line coherent system engine", + "publisher": "DC Comics", + "release_date": "2012-04-27T00:00:00", + "issues": 297, + "main_characters": [ + "Joshua Hicks" + ], + "genre": [ + "Action", + "Horror" + ] + }, + { + "_id": "9d37d0d7-1adc-4f54-8790-30f13472520c", + "title": "Progressive systematic superstructure", + "publisher": "Image Comics", + "release_date": "1996-02-20T00:00:00", + "issues": 295, + "main_characters": [ + "Margaret Hogan", + "Christopher Elliott", + "Joseph Cook" + ], + "genre": [ + "Fantasy", + "Adventure" + ] + }, + { + "_id": "338a83ad-06fc-42e1-a605-60a192ce5643", + "title": "Implemented national help-desk", + "publisher": "DC Comics", + "release_date": "2015-05-11T00:00:00", + "issues": 257, + "main_characters": [ + "Lindsay Anderson", + "James Sanchez", + "Julie Goodwin", + "Charles Blair" + ], + "genre": [ + "Action" + ] + }, + { + "_id": "5b07c17b-4df9-4b72-9c3e-b51d93def1fb", + "title": "Down-sized impactful workforce", + "publisher": "IDW Publishing", + "release_date": "2024-06-19T00:00:00", + "issues": 259, + "main_characters": [ + "Debbie Green" + ], + "genre": [ + "Sci-Fi", + "Superhero" + ] + }, + { + "_id": "625b11a5-bb45-4837-9cd6-50bfe2e3396c", + "title": "Re-engineered leadingedge structure", + "publisher": "DC Comics", + "release_date": "2011-04-14T00:00:00", + "issues": 282, + "main_characters": [ + "Larry Hensley", + "Joseph Cook", + "Brian Vincent", + "Sandra Moss" + ], + "genre": [ + "Adventure" + ] + }, + { + "_id": "71b845f3-4416-430a-81eb-8c208f824365", + "title": "Cloned 3rdgeneration contingency", + "publisher": "Dark Horse Comics", + "release_date": "2002-07-11T00:00:00", + "issues": 238, + "main_characters": [ + "Larry Hensley", + "Margaret Hogan", + "Holly Green", + "Joseph Bowman" + ], + "genre": [ + "Superhero", + "Fantasy" + ] + }, + { + "_id": "14dbf3a6-d258-4c96-8883-336b60bc2112", + "title": "Secured zero tolerance monitoring", + "publisher": "DC Comics", 
+ "release_date": "1969-11-30T00:00:00", + "issues": 104, + "main_characters": [ + "Micheal Brown" + ], + "genre": [ + "Horror", + "Superhero" + ] + }, + { + "_id": "091e16d8-d50c-4e7d-9b3a-545cf2596738", + "title": "Automated bifurcated access", + "publisher": "Image Comics", + "release_date": "1990-01-24T00:00:00", + "issues": 74, + "main_characters": [ + "Robert Logan" + ], + "genre": [ + "Sci-Fi" + ] + }, + { + "_id": "c47ec96a-4d6e-43ea-9bb5-00e4c8058b53", + "title": "Universal high-level pricing structure", + "publisher": "DC Comics", + "release_date": "1971-04-21T00:00:00", + "issues": 135, + "main_characters": [ + "Jeremy Rice", + "Elizabeth Robinson", + "James Sanchez" + ], + "genre": [ + "Action", + "Sci-Fi" + ] + }, + { + "_id": "d446a8ca-5d01-4be9-a061-027ef1f7bfc6", + "title": "Reduced optimizing strategy", + "publisher": "Dark Horse Comics", + "release_date": "1984-06-24T00:00:00", + "issues": 111, + "main_characters": [ + "Joshua Hicks", + "Jeremy Rice", + "Micheal Brown" + ], + "genre": [ + "Fantasy", + "Superhero" + ] + }, + { + "_id": "09c734ff-2bf0-4cb6-bd42-4232209c00c9", + "title": "Virtual non-volatile groupware", + "publisher": "DC Comics", + "release_date": "2013-05-22T00:00:00", + "issues": 13, + "main_characters": [ + "Luis Callahan", + "Tammy Bishop", + "Cynthia Brown" + ], + "genre": [ + "Action" + ] + }, + { + "_id": "691034fa-ad52-413e-96a2-a9a319fffe7b", + "title": "Horizontal disintermediate extranet", + "publisher": "DC Comics", + "release_date": "2021-12-03T00:00:00", + "issues": 129, + "main_characters": [ + "Margaret Hogan" + ], + "genre": [ + "Action" + ] + }, + { + "_id": "07942b5a-f7c4-4fc1-bdeb-7eb46b0d57f8", + "title": "Cross-platform discrete framework", + "publisher": "Dark Horse Comics", + "release_date": "2001-08-02T00:00:00", + "issues": 38, + "main_characters": [ + "James Sanchez", + "Larry Hensley" + ], + "genre": [ + "Superhero" + ] + }, + { + "_id": "05d637ed-3942-4276-a885-7b3363dd48e2", + "title": "Cross-platform 
regional info-mediaries", + "publisher": "Image Comics", + "release_date": "2005-03-30T00:00:00", + "issues": 150, + "main_characters": [ + "Carlos Burton" + ], + "genre": [ + "Superhero", + "Fantasy" + ] + }, + { + "_id": "88904f06-50a6-44f1-bccc-f379a9788611", + "title": "Mandatory 6thgeneration secured line", + "publisher": "Image Comics", + "release_date": "2021-06-27T00:00:00", + "issues": 262, + "main_characters": [ + "Luis Callahan" + ], + "genre": [ + "Sci-Fi", + "Superhero" + ] + }, + { + "_id": "fc961fd6-2ec6-43e5-beae-7f58a6c25d9c", + "title": "Exclusive interactive concept", + "publisher": "IDW Publishing", + "release_date": "1969-06-03T00:00:00", + "issues": 264, + "main_characters": [ + "Scott Garcia", + "Joseph Bowman" + ], + "genre": [ + "Fantasy", + "Superhero" + ] + }, + { + "_id": "481a3ea6-9629-4fe6-8a5a-eba846f0e62c", + "title": "Focused intermediate methodology", + "publisher": "DC Comics", + "release_date": "2004-03-19T00:00:00", + "issues": 210, + "main_characters": [ + "Justin Martinez", + "Julie Goodwin", + "Benjamin Morris", + "Virginia Watts" + ], + "genre": [ + "Adventure", + "Action" + ] + }, + { + "_id": "6bab6bcd-2f6b-4dfb-a030-d63b32fc6250", + "title": "Right-sized contextually-based toolset", + "publisher": "IDW Publishing", + "release_date": "2007-12-27T00:00:00", + "issues": 117, + "main_characters": [ + "Debbie Green", + "Christopher Elliott", + "Joshua Hicks" + ], + "genre": [ + "Sci-Fi", + "Action" + ] + } +] \ No newline at end of file diff --git a/tests/accuracy/test-data-dumps/comics.characters.json b/tests/accuracy/test-data-dumps/comics.characters.json new file mode 100644 index 00000000..944c33d5 --- /dev/null +++ b/tests/accuracy/test-data-dumps/comics.characters.json @@ -0,0 +1,576 @@ +[ + { + "_id": "d7047787-abea-40fa-b78e-939925fd3589", + "name": "Elizabeth Robinson", + "alias": "ashley62", + "powers": [ + "Shapeshifting", + "Telepathy", + "Flight" + ], + "first_appearance": "1961-06-23T00:00:00", + "affiliations": 
[ + "Fantastic Four", + "X-Men" + ], + "origin": "Earth", + "is_villain": false + }, + { + "_id": "06ac8173-51a6-404c-8f9a-628de889b1de", + "name": "Joshua Wang", + "alias": "paulasmith", + "powers": [ + "Telekinesis" + ], + "first_appearance": "1987-04-16T00:00:00", + "affiliations": [ + "Fantastic Four", + "Justice League" + ], + "origin": "Earth", + "is_villain": true + }, + { + "_id": "252c203a-0271-4ee7-a3d9-34c9f922b959", + "name": "Stephen Shaw", + "alias": "adamskenneth", + "powers": [ + "Super Speed", + "Flight" + ], + "first_appearance": "2004-07-26T00:00:00", + "affiliations": [], + "origin": "Atlantis", + "is_villain": true + }, + { + "_id": "bf5b7d04-fe71-4969-84a3-0eb9ed5d2197", + "name": "Joseph Bowman", + "alias": "amysalazar", + "powers": [ + "Time Manipulation" + ], + "first_appearance": "1961-07-03T00:00:00", + "affiliations": [ + "Teen Titans", + "Avengers" + ], + "origin": "Atlantis", + "is_villain": true + }, + { + "_id": "c6271161-bd78-4338-b6ca-88d91f7b853e", + "name": "Debbie Green", + "alias": "steventodd", + "powers": [ + "Energy Blasts", + "Regeneration" + ], + "first_appearance": "2021-12-05T00:00:00", + "affiliations": [], + "origin": "Asgard", + "is_villain": false + }, + { + "_id": "60223f4c-5908-4f82-a2a3-a5dad1771f7f", + "name": "Christopher Elliott", + "alias": "barajasmitchell", + "powers": [ + "Flight", + "Invisibility", + "Telekinesis" + ], + "first_appearance": "1947-03-23T00:00:00", + "affiliations": [], + "origin": "Earth", + "is_villain": false + }, + { + "_id": "f66a8f7a-9ca3-431a-9ece-aba96be18220", + "name": "Tammy Murphy", + "alias": "jessicagill", + "powers": [ + "Super Strength", + "Telekinesis" + ], + "first_appearance": "2000-07-06T00:00:00", + "affiliations": [], + "origin": "Mutant", + "is_villain": false + }, + { + "_id": "817c0b11-3eac-4a3a-b55f-203126db060f", + "name": "Scott Garcia", + "alias": "whitechristie", + "powers": [ + "Telepathy", + "Energy Blasts" + ], + "first_appearance": "2000-11-22T00:00:00", + 
"affiliations": [], + "origin": "Asgard", + "is_villain": false + }, + { + "_id": "1ee6789f-d774-43b8-87e2-9f6dbac6230a", + "name": "Julie Goodwin", + "alias": "robertsmith", + "powers": [ + "Telepathy", + "Super Speed" + ], + "first_appearance": "1953-08-09T00:00:00", + "affiliations": [ + "Teen Titans" + ], + "origin": "Mutant", + "is_villain": true + }, + { + "_id": "3ab9b55d-94ab-449e-bda9-63b2c633494a", + "name": "Joshua Hicks", + "alias": "cynthia32", + "powers": [ + "Super Strength", + "Invisibility", + "Telekinesis" + ], + "first_appearance": "1967-07-17T00:00:00", + "affiliations": [], + "origin": "Krypton", + "is_villain": false + }, + { + "_id": "51adf385-1f8e-4290-bcc6-ce2808dc461e", + "name": "Justin Martinez", + "alias": "janicebrown", + "powers": [ + "Super Speed", + "Super Strength" + ], + "first_appearance": "1973-09-19T00:00:00", + "affiliations": [ + "Avengers" + ], + "origin": "Mutant", + "is_villain": true + }, + { + "_id": "3a3d934e-f5bb-4238-b8a5-74669a937a14", + "name": "Holly Green", + "alias": "ystanley", + "powers": [ + "Shapeshifting", + "Energy Blasts" + ], + "first_appearance": "2013-08-05T00:00:00", + "affiliations": [], + "origin": "Krypton", + "is_villain": true + }, + { + "_id": "f044b9fb-82c6-48b3-b8b2-806b0be66466", + "name": "Margaret Hogan", + "alias": "wendyconway", + "powers": [ + "Super Speed", + "Telepathy" + ], + "first_appearance": "1944-08-13T00:00:00", + "affiliations": [ + "Justice League", + "X-Men" + ], + "origin": "Earth", + "is_villain": false + }, + { + "_id": "fd50880a-9d0e-43e1-8b20-2830eba8c7dc", + "name": "Ashley Watkins", + "alias": "cjohnson", + "powers": [ + "Shapeshifting" + ], + "first_appearance": "1940-09-13T00:00:00", + "affiliations": [ + "Fantastic Four", + "Guardians of the Galaxy" + ], + "origin": "Mutant", + "is_villain": true + }, + { + "_id": "68036d6b-1780-4352-98ea-2c68cb5c7bff", + "name": "Tammy Bishop", + "alias": "geoffreyryan", + "powers": [ + "Regeneration" + ], + "first_appearance": 
"1984-11-04T00:00:00", + "affiliations": [ + "Fantastic Four", + "X-Men" + ], + "origin": "Earth", + "is_villain": true + }, + { + "_id": "dbfa84f2-e598-4e67-99a9-5e8c34e5606f", + "name": "Michelle Valdez", + "alias": "manuelcobb", + "powers": [ + "Regeneration", + "Energy Blasts" + ], + "first_appearance": "2014-08-04T00:00:00", + "affiliations": [ + "Teen Titans" + ], + "origin": "Mutant", + "is_villain": false + }, + { + "_id": "ae85885c-13d0-4ae2-b82c-fa53859665d7", + "name": "Joseph Cook", + "alias": "scott40", + "powers": [ + "Telepathy", + "Telekinesis" + ], + "first_appearance": "1976-04-01T00:00:00", + "affiliations": [], + "origin": "Earth", + "is_villain": true + }, + { + "_id": "0738b98f-4699-4609-9156-fb6a1085a503", + "name": "Jeremy Rice", + "alias": "james82", + "powers": [ + "Invisibility" + ], + "first_appearance": "1977-09-22T00:00:00", + "affiliations": [], + "origin": "Asgard", + "is_villain": false + }, + { + "_id": "a072c5df-cc65-4044-ba24-fcc8eaa71b4a", + "name": "Chad Pham", + "alias": "smithjennifer", + "powers": [ + "Telepathy" + ], + "first_appearance": "2001-05-26T00:00:00", + "affiliations": [ + "Teen Titans" + ], + "origin": "Mars", + "is_villain": false + }, + { + "_id": "d545ec48-680c-4493-8650-d759bedabb7e", + "name": "Diana Mata", + "alias": "zwilliamson", + "powers": [ + "Super Speed", + "Energy Blasts", + "Invisibility" + ], + "first_appearance": "2010-11-21T00:00:00", + "affiliations": [], + "origin": "Mars", + "is_villain": false + }, + { + "_id": "e6bfb576-d65c-40f8-a547-90719578e03c", + "name": "Maria Wright", + "alias": "yraymond", + "powers": [ + "Flight", + "Telepathy" + ], + "first_appearance": "1971-04-15T00:00:00", + "affiliations": [ + "Avengers", + "Teen Titans" + ], + "origin": "Asgard", + "is_villain": true + }, + { + "_id": "a2e7b056-0c79-4a2e-83ff-1774b6e186ea", + "name": "Carlos Burton", + "alias": "rperkins", + "powers": [ + "Super Speed", + "Time Manipulation", + "Telekinesis" + ], + "first_appearance": 
"1970-01-20T00:00:00", + "affiliations": [ + "Teen Titans" + ], + "origin": "Mutant", + "is_villain": true + }, + { + "_id": "ec7f8d60-3fef-4329-a7d2-6d89805d758c", + "name": "Lindsay Anderson", + "alias": "amycox", + "powers": [ + "Super Strength", + "Telekinesis" + ], + "first_appearance": "1976-04-30T00:00:00", + "affiliations": [], + "origin": "Atlantis", + "is_villain": false + }, + { + "_id": "cdc66356-a438-4989-b4d1-315609ec6d91", + "name": "Larry Hensley", + "alias": "ylester", + "powers": [ + "Super Strength", + "Invisibility", + "Shapeshifting" + ], + "first_appearance": "2019-01-21T00:00:00", + "affiliations": [ + "Guardians of the Galaxy", + "Avengers" + ], + "origin": "Asgard", + "is_villain": false + }, + { + "_id": "0952b684-f887-446f-afcb-71d2ace3fd32", + "name": "Sandra Moss", + "alias": "alexandra81", + "powers": [ + "Telekinesis", + "Super Speed" + ], + "first_appearance": "1989-07-28T00:00:00", + "affiliations": [], + "origin": "Earth", + "is_villain": false + }, + { + "_id": "9a63c787-3b44-46c2-b927-ffdde6ee10bc", + "name": "Cynthia Brown", + "alias": "freed", + "powers": [ + "Super Strength", + "Energy Blasts" + ], + "first_appearance": "2015-06-19T00:00:00", + "affiliations": [ + "Fantastic Four" + ], + "origin": "Mars", + "is_villain": false + }, + { + "_id": "2b058c3e-e795-4ecd-b5d7-dba6f1a831f6", + "name": "Brian Vincent", + "alias": "ghowell", + "powers": [ + "Invisibility", + "Flight", + "Super Speed" + ], + "first_appearance": "2012-05-12T00:00:00", + "affiliations": [], + "origin": "Asgard", + "is_villain": false + }, + { + "_id": "7a1e38ae-0bc6-41dd-ad61-e7542e6e9d4f", + "name": "Kevin Humphrey", + "alias": "mary44", + "powers": [ + "Super Strength", + "Super Speed", + "Telepathy" + ], + "first_appearance": "1993-05-10T00:00:00", + "affiliations": [ + "Justice League", + "Teen Titans" + ], + "origin": "Mutant", + "is_villain": true + }, + { + "_id": "c147036a-ab66-4023-a950-1fb81acf7dca", + "name": "Luis Callahan", + "alias": 
"ashleyreeves", + "powers": [ + "Telekinesis" + ], + "first_appearance": "1943-11-02T00:00:00", + "affiliations": [ + "X-Men" + ], + "origin": "Krypton", + "is_villain": false + }, + { + "_id": "c42cec2b-156d-481e-993b-aa93637ae76e", + "name": "Micheal Brown", + "alias": "lisa85", + "powers": [ + "Telepathy", + "Flight", + "Time Manipulation" + ], + "first_appearance": "1983-11-04T00:00:00", + "affiliations": [], + "origin": "Krypton", + "is_villain": false + }, + { + "_id": "5bd85192-926b-42f3-bc18-afd40a53753e", + "name": "James Sanchez", + "alias": "mary95", + "powers": [ + "Energy Blasts", + "Telekinesis" + ], + "first_appearance": "1999-05-20T00:00:00", + "affiliations": [ + "Justice League" + ], + "origin": "Atlantis", + "is_villain": false + }, + { + "_id": "4b41e8f8-2cea-4d50-b7b0-ec59fca45367", + "name": "Richard Cooper", + "alias": "james85", + "powers": [ + "Telekinesis", + "Energy Blasts", + "Super Speed" + ], + "first_appearance": "2021-11-27T00:00:00", + "affiliations": [ + "Justice League", + "Fantastic Four" + ], + "origin": "Mars", + "is_villain": true + }, + { + "_id": "8fd8c7b5-fabd-4021-9aeb-114e64ad06e0", + "name": "Charles Blair", + "alias": "barbara60", + "powers": [ + "Super Strength" + ], + "first_appearance": "2012-05-03T00:00:00", + "affiliations": [], + "origin": "Krypton", + "is_villain": false + }, + { + "_id": "830eaa54-4397-4344-8964-2abdd7e2d86d", + "name": "Virginia Watts", + "alias": "klane", + "powers": [ + "Telekinesis" + ], + "first_appearance": "2016-04-27T00:00:00", + "affiliations": [], + "origin": "Earth", + "is_villain": false + }, + { + "_id": "495f64a9-123e-46d4-9ddb-21692353a849", + "name": "Robert Logan", + "alias": "griffinsean", + "powers": [ + "Telepathy" + ], + "first_appearance": "2003-07-16T00:00:00", + "affiliations": [], + "origin": "Krypton", + "is_villain": false + }, + { + "_id": "e3a96aac-bd9f-49f0-a9ea-efa7d6baf3e9", + "name": "Cheyenne Powell", + "alias": "laurenolsen", + "powers": [ + "Time 
Manipulation", + "Energy Blasts" + ], + "first_appearance": "1964-02-05T00:00:00", + "affiliations": [], + "origin": "Atlantis", + "is_villain": false + }, + { + "_id": "2688321c-f5b0-43c8-b95c-060e748ba73b", + "name": "Benjamin Morris", + "alias": "sierra18", + "powers": [ + "Telekinesis", + "Regeneration", + "Shapeshifting" + ], + "first_appearance": "1964-09-27T00:00:00", + "affiliations": [ + "X-Men", + "Avengers" + ], + "origin": "Mars", + "is_villain": false + }, + { + "_id": "98c4ca66-c7a7-44ad-ad16-5395905a011e", + "name": "Cristian Oneal", + "alias": "harrellamy", + "powers": [ + "Super Speed" + ], + "first_appearance": "1965-01-29T00:00:00", + "affiliations": [], + "origin": "Mutant", + "is_villain": false + }, + { + "_id": "e2999d26-1a93-4355-b04f-44f27a3c7f36", + "name": "Jessica Vargas", + "alias": "chadherrera", + "powers": [ + "Energy Blasts", + "Super Strength", + "Telekinesis" + ], + "first_appearance": "1974-03-29T00:00:00", + "affiliations": [ + "X-Men", + "Teen Titans" + ], + "origin": "Earth", + "is_villain": true + }, + { + "_id": "f3fa712d-2124-433a-b405-c02757fa1503", + "name": "Angelica Stein", + "alias": "reedjason", + "powers": [ + "Invisibility" + ], + "first_appearance": "1981-01-02T00:00:00", + "affiliations": [ + "Avengers" + ], + "origin": "Earth", + "is_villain": true + } +] \ No newline at end of file diff --git a/tests/accuracy/test-data-dumps/mflix.movies.json b/tests/accuracy/test-data-dumps/mflix.movies.json new file mode 100644 index 00000000..cd35382e --- /dev/null +++ b/tests/accuracy/test-data-dumps/mflix.movies.json @@ -0,0 +1,687 @@ +[ + { + "_id": "bf96c9f7-17be-467c-9f5e-3f19dc2e9ed4", + "title": "Human sell", + "release_year": 1993, + "genres": [ + "Sci-Fi" + ], + "director": "Christina Collins", + "cast": [ + "Jeremy Marks", + "Matthew Moore", + "Erica Miller", + "Beth Morales" + ], + "runtime": 139, + "rating": 9.3 + }, + { + "_id": "ab338dcb-c541-4d39-ba3d-58e4ebcac16c", + "title": "Trial we much", + "release_year": 
2020, + "genres": [ + "Horror", + "Comedy" + ], + "director": "Steven Miles", + "cast": [ + "Patrick Huynh", + "Darrell Thompson", + "Lindsay Thompson", + "Brandi Cooper" + ], + "runtime": 149, + "rating": 5.0 + }, + { + "_id": "2bd3ed9f-cbeb-4c44-bec7-01d51c3dd7db", + "title": "Someone", + "release_year": 1996, + "genres": [ + "Action", + "Horror" + ], + "director": "Steven Miles", + "cast": [ + "Carrie Cummings", + "Patricia Rice", + "Suzanne Collins", + "April Murray", + "Kimberly Shaw" + ], + "runtime": 153, + "rating": 2.6 + }, + { + "_id": "fb35d6f3-bda5-450f-8873-56e035e76c42", + "title": "Without our", + "release_year": 2012, + "genres": [ + "Comedy" + ], + "director": "Christina Collins", + "cast": [ + "Rodney Gray", + "Mr. Joseph Allen", + "Heather Robles", + "Eric Edwards", + "James Wilson" + ], + "runtime": 143, + "rating": 9.1 + }, + { + "_id": "4b0d5f7a-c551-4995-aece-a5a585d238a7", + "title": "Cost anything", + "release_year": 2002, + "genres": [ + "Romance", + "Action" + ], + "director": "Bryan Andrews", + "cast": [ + "Gregory Mullins", + "Jillian Arroyo", + "Angela Reed" + ], + "runtime": 112, + "rating": 3.8 + }, + { + "_id": "797e4ee5-eff4-45f4-a0d7-40f62f7bd138", + "title": "Hold green energy their", + "release_year": 1989, + "genres": [ + "Horror" + ], + "director": "Christina Collins", + "cast": [ + "Eduardo Carey", + "Jodi Miller", + "Ronald Johnson", + "Lindsay Hernandez" + ], + "runtime": 126, + "rating": 7.4 + }, + { + "_id": "1b81c45b-1d09-47dc-871f-ace109107446", + "title": "Choose ability start", + "release_year": 1990, + "genres": [ + "Drama", + "Comedy" + ], + "director": "Bryan Andrews", + "cast": [ + "Tyler Daniels", + "Gregory Harris", + "Whitney Swanson", + "Pamela Ramirez" + ], + "runtime": 141, + "rating": 5.6 + }, + { + "_id": "400a08be-f07b-416a-8cdc-46c9886b812b", + "title": "Cover perhaps", + "release_year": 2022, + "genres": [ + "Drama" + ], + "director": "Daniel Wallace", + "cast": [ + "Victoria Price", + "Holly Ross", + 
"Michele Jones" + ], + "runtime": 173, + "rating": 4.3 + }, + { + "_id": "4d4b5420-83e1-4ecd-9c86-238394a1fd0f", + "title": "Policy particularly", + "release_year": 2003, + "genres": [ + "Comedy" + ], + "director": "Brittany Parker", + "cast": [ + "Emily Haynes", + "Crystal Johnson", + "Ernest Jones" + ], + "runtime": 154, + "rating": 6.6 + }, + { + "_id": "9a489559-ab9d-4dbb-b3e7-d65895b27704", + "title": "Store care", + "release_year": 2017, + "genres": [ + "Romance", + "Sci-Fi" + ], + "director": "Sara Stewart", + "cast": [ + "Katherine Matthews", + "Stacey Wolf", + "Laurie Blackwell", + "Luis Ortiz", + "Christopher Vasquez" + ], + "runtime": 168, + "rating": 7.7 + }, + { + "_id": "99e75e60-6466-4314-92c3-00c433a06600", + "title": "Section close bad", + "release_year": 2024, + "genres": [ + "Drama", + "Comedy" + ], + "director": "Bryan Andrews", + "cast": [ + "Heather Marshall", + "Alexander Austin", + "Stephanie Villarreal MD", + "Ryan Marquez" + ], + "runtime": 180, + "rating": 7.7 + }, + { + "_id": "726d0c12-4bab-4684-b8e4-5ba795c88273", + "title": "Become stand", + "release_year": 2001, + "genres": [ + "Sci-Fi", + "Thriller" + ], + "director": "Brian Martinez", + "cast": [ + "Robert Ross", + "Kimberly Williamson", + "Pam Wyatt" + ], + "runtime": 162, + "rating": 1.5 + }, + { + "_id": "aad23b4b-ddb9-48bd-9b48-b63da1874bb0", + "title": "I case", + "release_year": 2012, + "genres": [ + "Drama", + "Comedy" + ], + "director": "Brittany Parker", + "cast": [ + "Justin Davis", + "Karen Doyle", + "Daniel Jackson", + "Courtney Mcdonald" + ], + "runtime": 122, + "rating": 3.1 + }, + { + "_id": "0d1ce099-18f1-4608-9c5b-5eb8b5870760", + "title": "No organization style", + "release_year": 2013, + "genres": [ + "Comedy" + ], + "director": "Christina Collins", + "cast": [ + "Benjamin Whitney", + "Joseph Bush", + "Barbara Griffin" + ], + "runtime": 167, + "rating": 9.6 + }, + { + "_id": "15855c7b-ece2-4238-b995-57f6207509ea", + "title": "Computer garden", + "release_year": 
2012, + "genres": [ + "Horror" + ], + "director": "Steven Miles", + "cast": [ + "Darlene Lee", + "Tina Wang", + "Nathan Mayo" + ], + "runtime": 146, + "rating": 6.5 + }, + { + "_id": "e8a6ff98-1e7e-4481-a467-39ebbfc79f67", + "title": "Trip information feel", + "release_year": 2008, + "genres": [ + "Action", + "Thriller" + ], + "director": "Brittany Parker", + "cast": [ + "Kelly Walsh", + "Michael Rocha" + ], + "runtime": 148, + "rating": 9.8 + }, + { + "_id": "ef95e7a5-7f73-462e-bd03-c924a8876a7b", + "title": "It project low part", + "release_year": 1992, + "genres": [ + "Horror" + ], + "director": "Christina Collins", + "cast": [ + "Sheena Murphy", + "Amanda Miller", + "Erica Curtis", + "Roger Jones", + "Andrew Simpson" + ], + "runtime": 161, + "rating": 2.4 + }, + { + "_id": "efd2f4f4-1004-4b4e-8bc9-390466a6f77a", + "title": "Near attorney discuss", + "release_year": 1983, + "genres": [ + "Comedy" + ], + "director": "Christina Collins", + "cast": [ + "Chase Myers", + "Benjamin Kelly", + "Thomas Summers MD", + "Jessica Woods" + ], + "runtime": 174, + "rating": 9.5 + }, + { + "_id": "07f2cb6e-819e-4ff4-b3ba-134d3d9af549", + "title": "Whether know", + "release_year": 2009, + "genres": [ + "Comedy", + "Thriller" + ], + "director": "Bryan Andrews", + "cast": [ + "Amy Reed", + "William Williams", + "Steven Lawrence" + ], + "runtime": 134, + "rating": 9.6 + }, + { + "_id": "ab5948c9-088b-42d6-89d9-42c4603c8b19", + "title": "Against place", + "release_year": 2017, + "genres": [ + "Drama", + "Romance" + ], + "director": "Daniel Wallace", + "cast": [ + "Brittany Thompson", + "Clinton Bishop", + "Terri Meyer", + "Stacey Phillips", + "Alexander Hunt" + ], + "runtime": 152, + "rating": 5.0 + }, + { + "_id": "ef7f63fa-b25f-4aea-98e2-d7bdecc26ef5", + "title": "Return yard", + "release_year": 1994, + "genres": [ + "Horror" + ], + "director": "Christina Collins", + "cast": [ + "Mason Lara", + "Taylor Salinas", + "Tim Foster", + "Erin Sharp" + ], + "runtime": 99, + "rating": 8.8 + 
}, + { + "_id": "b532e3c8-6292-4f9d-879f-1f070b1a6992", + "title": "Certain fish", + "release_year": 2009, + "genres": [ + "Romance" + ], + "director": "Steven Miles", + "cast": [ + "Jonathan King", + "Caitlyn Costa DDS", + "Steve Davis", + "Perry Anderson" + ], + "runtime": 130, + "rating": 8.6 + }, + { + "_id": "c95e74b0-e47e-4d10-b847-8caa20b94b32", + "title": "Agreement like program", + "release_year": 2004, + "genres": [ + "Sci-Fi" + ], + "director": "Daniel Jackson", + "cast": [ + "Ashley Green", + "Rebecca Osborne", + "Robert Williams", + "Breanna Dunn", + "Philip Vargas" + ], + "runtime": 110, + "rating": 8.1 + }, + { + "_id": "791688be-4358-45ab-956e-71fe3fd35d19", + "title": "Floor seven then", + "release_year": 2009, + "genres": [ + "Horror" + ], + "director": "Daniel Wallace", + "cast": [ + "Dustin Wright", + "Crystal Young" + ], + "runtime": 143, + "rating": 4.8 + }, + { + "_id": "488fd79d-dde6-4462-9b90-339d1f3d7474", + "title": "Like rather paper", + "release_year": 2006, + "genres": [ + "Drama" + ], + "director": "Spencer Gillespie", + "cast": [ + "Sean Moyer", + "James Edwards", + "Tara Lee", + "Robert Scott" + ], + "runtime": 175, + "rating": 9.1 + }, + { + "_id": "3da68e4d-ef14-4fab-9243-19075262e5ca", + "title": "Argue hospital", + "release_year": 1994, + "genres": [ + "Romance", + "Sci-Fi" + ], + "director": "Amanda Young", + "cast": [ + "Carolyn Williams", + "Jasmin Sampson", + "Phillip Levy", + "Brenda Clark", + "Lauren Perry" + ], + "runtime": 149, + "rating": 9.5 + }, + { + "_id": "f5206a16-4dca-4c1e-b3aa-0d09f2082601", + "title": "Become after card", + "release_year": 1986, + "genres": [ + "Sci-Fi", + "Horror" + ], + "director": "Brian Martinez", + "cast": [ + "Rhonda Ochoa", + "Charlene Castillo" + ], + "runtime": 100, + "rating": 8.5 + }, + { + "_id": "fbf30e42-ae6d-4775-bb3e-c5c127ddea06", + "title": "Born authority attention", + "release_year": 1994, + "genres": [ + "Romance" + ], + "director": "Brian Martinez", + "cast": [ + "Matthew 
Thomas", + "Carly Perkins" + ], + "runtime": 131, + "rating": 4.9 + }, + { + "_id": "4b85a220-8a09-46a7-bea3-a2dad8130311", + "title": "Local seven media", + "release_year": 1998, + "genres": [ + "Sci-Fi", + "Drama" + ], + "director": "Amanda Young", + "cast": [ + "Jessica Perez", + "Larry Atkinson" + ], + "runtime": 95, + "rating": 2.0 + }, + { + "_id": "498597d2-3254-46ef-a800-f322a86fbd55", + "title": "Keep employee", + "release_year": 1981, + "genres": [ + "Horror" + ], + "director": "Christina Collins", + "cast": [ + "Alexis Carlson", + "Andrew Stewart" + ], + "runtime": 161, + "rating": 6.0 + }, + { + "_id": "788d9343-6908-4762-88ee-b04aba1e58b5", + "title": "American question generation", + "release_year": 1986, + "genres": [ + "Romance" + ], + "director": "Daniel Jackson", + "cast": [ + "Troy Carter", + "Peter Hernandez", + "Christine Brown" + ], + "runtime": 176, + "rating": 8.0 + }, + { + "_id": "74bcf255-df91-40c0-85c0-d7b85ff84f9a", + "title": "Maintain out", + "release_year": 2000, + "genres": [ + "Sci-Fi", + "Action" + ], + "director": "Brian Martinez", + "cast": [ + "Nancy Evans", + "Michael Gill", + "Justin Carroll" + ], + "runtime": 179, + "rating": 10.0 + }, + { + "_id": "61ddf1d4-17b7-4c63-9bf4-5315e740dc7f", + "title": "Ten box study", + "release_year": 2011, + "genres": [ + "Horror", + "Romance" + ], + "director": "Steven Miles", + "cast": [ + "Mark Hicks", + "Michelle Dean", + "John Buchanan", + "Veronica Johnson" + ], + "runtime": 147, + "rating": 2.5 + }, + { + "_id": "ab7d8067-f0ff-4955-bc0c-baca4e56e9a4", + "title": "Production operation", + "release_year": 2014, + "genres": [ + "Horror", + "Romance" + ], + "director": "Sara Stewart", + "cast": [ + "Ashley Mata", + "Mark Kelly", + "John West", + "Harold Day" + ], + "runtime": 125, + "rating": 4.1 + }, + { + "_id": "ccd27288-a496-447d-b01c-1f0b42edcc92", + "title": "What language", + "release_year": 2004, + "genres": [ + "Sci-Fi" + ], + "director": "Sara Stewart", + "cast": [ + "Scott 
Mckenzie", + "Jason Lee", + "Nathan Gardner", + "Jamie Greene", + "Angela Garner" + ], + "runtime": 177, + "rating": 3.7 + }, + { + "_id": "b32dd176-938b-4ded-823a-311423fdc2ea", + "title": "Up usually central", + "release_year": 2011, + "genres": [ + "Sci-Fi", + "Comedy" + ], + "director": "Daniel Jackson", + "cast": [ + "Jennifer Carlson", + "Jonathan Stewart DDS", + "Amy Lester" + ], + "runtime": 159, + "rating": 5.6 + }, + { + "_id": "4aa5f384-3a05-49ff-aa9d-a0e4256c422f", + "title": "For boy only", + "release_year": 1987, + "genres": [ + "Thriller", + "Action" + ], + "director": "Sara Stewart", + "cast": [ + "Gene Smith", + "Robert Osborne Jr.", + "Laura Fox", + "Alexis Lowe" + ], + "runtime": 95, + "rating": 3.6 + }, + { + "_id": "1c858ca4-d6e9-435c-8e25-d8b05a4e825c", + "title": "Site win including your", + "release_year": 2008, + "genres": [ + "Sci-Fi" + ], + "director": "Spencer Gillespie", + "cast": [ + "John Williams", + "Jason Huang", + "Karen Klein", + "Gary Tran", + "Jessica Murphy" + ], + "runtime": 178, + "rating": 6.2 + }, + { + "_id": "bc5e5766-e998-4ec2-a40c-62ce5d39b972", + "title": "Sell huge hair", + "release_year": 1997, + "genres": [ + "Thriller", + "Action" + ], + "director": "Bryan Andrews", + "cast": [ + "Thomas Johnson", + "Ryan Morrow" + ], + "runtime": 157, + "rating": 4.4 + }, + { + "_id": "090215c8-29e8-4d38-ae9b-ceb78408b982", + "title": "Guy rest", + "release_year": 1997, + "genres": [ + "Sci-Fi", + "Horror" + ], + "director": "Steven Miles", + "cast": [ + "Michael Fox", + "Tyler Acosta", + "Tracy Adams" + ], + "runtime": 122, + "rating": 7.8 + } +] \ No newline at end of file diff --git a/tests/accuracy/test-data-dumps/mflix.shows.json b/tests/accuracy/test-data-dumps/mflix.shows.json new file mode 100644 index 00000000..e91c26bb --- /dev/null +++ b/tests/accuracy/test-data-dumps/mflix.shows.json @@ -0,0 +1,750 @@ +[ + { + "_id": "b586e37c-6b32-417d-a53c-2a4c1121b11b", + "title": "Object-based analyzing architecture", + "seasons": 
8, + "episodes": 62, + "platform": "Amazon Prime", + "genres": [ + "Comedy" + ], + "cast": [ + "Roger Gomez", + "Sandra Williams", + "Matthew Rodriguez", + "Scott Brown", + "Kristie Horn", + "Nicole Avila" + ], + "start_year": 2014, + "end_year": null + }, + { + "_id": "c28471ea-336f-4060-9b18-0bbff3de6622", + "title": "Customer-focused encompassing architecture", + "seasons": 4, + "episodes": 108, + "platform": "Hulu", + "genres": [ + "Thriller" + ], + "cast": [ + "Joseph Holmes", + "Patrick Smith", + "Charles Delacruz" + ], + "start_year": 2001, + "end_year": null + }, + { + "_id": "93f0969b-2377-4531-9c4e-45d2593015cd", + "title": "User-centric background approach", + "seasons": 6, + "episodes": 49, + "platform": "HBO", + "genres": [ + "Comedy", + "Documentary" + ], + "cast": [ + "Jason Castillo", + "Jessica Burke", + "Philip Lewis", + "Philip Goodman", + "Corey Lee" + ], + "start_year": 2016, + "end_year": 2018 + }, + { + "_id": "a0b76db0-99a1-49fe-a5ea-fe802a66bde9", + "title": "Networked directional budgetary management", + "seasons": 5, + "episodes": 23, + "platform": "Amazon Prime", + "genres": [ + "Comedy", + "Thriller" + ], + "cast": [ + "Mark Allen", + "Anthony Snyder", + "Kimberly Jones" + ], + "start_year": 2002, + "end_year": null + }, + { + "_id": "fbdef9b9-1ad4-4a6b-a39a-2e0b90423cb5", + "title": "Enterprise-wide dynamic intranet", + "seasons": 1, + "episodes": 12, + "platform": "Amazon Prime", + "genres": [ + "Crime", + "Documentary" + ], + "cast": [ + "Matthew Green", + "Kelly Wright", + "Tonya Sullivan", + "Daniel Brown" + ], + "start_year": 2009, + "end_year": 2020 + }, + { + "_id": "db54ab5c-bf6b-48ea-8272-1b1a4a76b848", + "title": "Exclusive real-time access", + "seasons": 10, + "episodes": 76, + "platform": "Amazon Prime", + "genres": [ + "Drama" + ], + "cast": [ + "Stacey Shaw", + "Zachary Steele", + "Laurie Martinez" + ], + "start_year": 2011, + "end_year": 2020 + }, + { + "_id": "53869b62-c8c7-48b3-86c9-17c935b43ff6", + "title": 
"Persevering leadingedge application", + "seasons": 5, + "episodes": 73, + "platform": "HBO", + "genres": [ + "Thriller" + ], + "cast": [ + "Diane Boyd", + "Anna Rubio", + "Cheryl Fisher", + "Tyler Villa" + ], + "start_year": 2008, + "end_year": 2020 + }, + { + "_id": "3be07c4d-5275-4181-b2f6-5b1a1e46aa7b", + "title": "Multi-lateral analyzing model", + "seasons": 2, + "episodes": 114, + "platform": "Amazon Prime", + "genres": [ + "Fantasy" + ], + "cast": [ + "Kathleen Marshall", + "Kimberly Quinn", + "Steven Parker", + "Adrienne Green", + "Justin Hughes", + "Jean Smith" + ], + "start_year": 2017, + "end_year": 2023 + }, + { + "_id": "50cb455b-5ec0-4e68-8601-43e58defb762", + "title": "User-centric tangible monitoring", + "seasons": 3, + "episodes": 55, + "platform": "Disney+", + "genres": [ + "Drama" + ], + "cast": [ + "Barbara Clark", + "Carolyn Scott", + "Timothy Reed", + "Cory Burton", + "Jacob Hill" + ], + "start_year": 2006, + "end_year": 2012 + }, + { + "_id": "bab2dba4-88bd-4b24-afce-8781eb280d53", + "title": "Persevering background monitoring", + "seasons": 4, + "episodes": 61, + "platform": "Amazon Prime", + "genres": [ + "Comedy", + "Fantasy" + ], + "cast": [ + "Adam Lin", + "Evan Smith", + "Christine Howard", + "Ruben Hopkins" + ], + "start_year": 2006, + "end_year": 2023 + }, + { + "_id": "518f2ad9-bb65-4228-8d4c-7a62b9f88599", + "title": "Cross-group intangible architecture", + "seasons": 1, + "episodes": 90, + "platform": "HBO", + "genres": [ + "Comedy" + ], + "cast": [ + "Eric Ryan", + "Ashley Ball", + "Douglas Barton", + "Brian Whitehead", + "Michael Greer" + ], + "start_year": 2018, + "end_year": null + }, + { + "_id": "d5f9304d-567d-4335-b43c-ec4034d7009f", + "title": "Programmable bottom-line monitoring", + "seasons": 10, + "episodes": 69, + "platform": "Hulu", + "genres": [ + "Documentary", + "Fantasy" + ], + "cast": [ + "Mrs. 
Olivia Booth", + "William Murphy", + "Patricia Payne", + "Lisa Estes", + "Jason Martin", + "Jeff Greene" + ], + "start_year": 2011, + "end_year": 2024 + }, + { + "_id": "27718a30-6e42-47ad-8adf-1533b9b8a419", + "title": "Multi-lateral multi-tasking contingency", + "seasons": 3, + "episodes": 89, + "platform": "Disney+", + "genres": [ + "Crime" + ], + "cast": [ + "Elizabeth Lambert", + "Corey Hughes", + "Melissa Stephens" + ], + "start_year": 2006, + "end_year": null + }, + { + "_id": "defc7620-3b4e-46ff-a949-bec1af753812", + "title": "Focused zero administration migration", + "seasons": 9, + "episodes": 73, + "platform": "Disney+", + "genres": [ + "Documentary", + "Drama" + ], + "cast": [ + "Shane Richardson", + "Lisa Cooper", + "Samantha Perkins" + ], + "start_year": 2008, + "end_year": null + }, + { + "_id": "9d6781fb-d095-4a00-932d-3f1fac1b0049", + "title": "Horizontal methodical encoding", + "seasons": 8, + "episodes": 40, + "platform": "Netflix", + "genres": [ + "Crime" + ], + "cast": [ + "Patricia Barrett", + "Scott Gonzalez", + "Michaela Johnson" + ], + "start_year": 2006, + "end_year": null + }, + { + "_id": "ac19b1b1-2bf9-4093-83fa-60411aa3f80f", + "title": "Enterprise-wide analyzing product", + "seasons": 8, + "episodes": 61, + "platform": "Hulu", + "genres": [ + "Drama" + ], + "cast": [ + "Christie Waters", + "Casey Allen", + "Nicole Frank" + ], + "start_year": 2001, + "end_year": 2005 + }, + { + "_id": "2dfd2240-dc9f-439f-9e06-b1ec8de397bf", + "title": "Compatible well-modulated extranet", + "seasons": 10, + "episodes": 89, + "platform": "Hulu", + "genres": [ + "Drama" + ], + "cast": [ + "Pedro Butler", + "Christian Hall", + "Dawn Gregory", + "Shannon Russell", + "Omar Mullins", + "Ian Ramos" + ], + "start_year": 2012, + "end_year": 2013 + }, + { + "_id": "94db1534-7163-430e-83e3-6a75bc6aec0f", + "title": "User-centric tangible infrastructure", + "seasons": 5, + "episodes": 11, + "platform": "Hulu", + "genres": [ + "Drama" + ], + "cast": [ + "Deborah 
Garcia", + "Michelle Barajas", + "Melissa Reynolds", + "Douglas Wilson" + ], + "start_year": 2001, + "end_year": null + }, + { + "_id": "65b2213f-a606-42d8-b845-0199ba2e9b82", + "title": "Inverse optimal circuit", + "seasons": 1, + "episodes": 29, + "platform": "Amazon Prime", + "genres": [ + "Fantasy", + "Documentary" + ], + "cast": [ + "Grace Rodriguez", + "Alison Greene", + "Michael Allen", + "Steven Hayden" + ], + "start_year": 2013, + "end_year": null + }, + { + "_id": "5a8a2745-e57c-4086-aa09-84131f40149f", + "title": "Public-key discrete alliance", + "seasons": 9, + "episodes": 111, + "platform": "Disney+", + "genres": [ + "Documentary" + ], + "cast": [ + "Emily Irwin", + "Olivia Gibson", + "Jean Hernandez", + "Michael Cummings" + ], + "start_year": 2013, + "end_year": 2022 + }, + { + "_id": "51326558-2080-4615-a583-b4f2fbd15600", + "title": "Managed zero administration groupware", + "seasons": 8, + "episodes": 108, + "platform": "Hulu", + "genres": [ + "Drama", + "Crime" + ], + "cast": [ + "Karen Phillips", + "Kelly Marsh", + "Daniel Hamilton", + "Abigail Smith" + ], + "start_year": 2018, + "end_year": 2019 + }, + { + "_id": "87a2cd5f-75ee-4650-b2a4-a56384c97137", + "title": "Reverse-engineered static initiative", + "seasons": 6, + "episodes": 66, + "platform": "Amazon Prime", + "genres": [ + "Crime", + "Documentary" + ], + "cast": [ + "Bradley Chavez", + "Catherine Horn", + "Joseph Bryant", + "Tara Rodriguez" + ], + "start_year": 2003, + "end_year": 2006 + }, + { + "_id": "0f647458-d09f-4be8-b1dc-49be1ba1e104", + "title": "Fundamental tangible matrices", + "seasons": 9, + "episodes": 22, + "platform": "Hulu", + "genres": [ + "Drama" + ], + "cast": [ + "Eric Lee", + "Patrick Estrada", + "Kelsey Brown", + "Jeffrey Lewis" + ], + "start_year": 2001, + "end_year": null + }, + { + "_id": "53d34237-0e86-4a5e-922b-0589c2e65458", + "title": "Self-enabling homogeneous infrastructure", + "seasons": 5, + "episodes": 35, + "platform": "Hulu", + "genres": [ + "Crime" + 
], + "cast": [ + "Chad Torres", + "Mark Williams", + "Terry Mcguire", + "Kathleen Cantu", + "Harold Knapp" + ], + "start_year": 2006, + "end_year": null + }, + { + "_id": "71cc1515-ba84-4df6-92db-55af3cfa91f0", + "title": "Horizontal web-enabled application", + "seasons": 2, + "episodes": 94, + "platform": "Netflix", + "genres": [ + "Thriller", + "Fantasy" + ], + "cast": [ + "Catherine Davila", + "Jessica James", + "Cory Miller", + "Alexis Sanchez", + "Andrew Miller" + ], + "start_year": 2002, + "end_year": 2017 + }, + { + "_id": "200556f7-10c6-4414-83f7-24ef74bff12a", + "title": "User-friendly bi-directional data-warehouse", + "seasons": 2, + "episodes": 87, + "platform": "Hulu", + "genres": [ + "Drama", + "Fantasy" + ], + "cast": [ + "Tiffany Brown", + "Christina Morales", + "Samuel Blake", + "Stephanie Johnson", + "Wesley Deleon" + ], + "start_year": 2020, + "end_year": null + }, + { + "_id": "613832c9-5307-4c80-9dde-3eab4e5aa770", + "title": "Pre-emptive leadingedge capacity", + "seasons": 5, + "episodes": 56, + "platform": "Netflix", + "genres": [ + "Comedy" + ], + "cast": [ + "James Durham", + "Jessica Myers", + "Rachel King" + ], + "start_year": 2005, + "end_year": null + }, + { + "_id": "f9cb1076-3eaf-41d2-84df-057d27c1a544", + "title": "Fundamental intangible contingency", + "seasons": 4, + "episodes": 99, + "platform": "Disney+", + "genres": [ + "Crime", + "Fantasy" + ], + "cast": [ + "Robert Foster", + "Jill Barton", + "Kimberly Simmons", + "Tracey Gomez" + ], + "start_year": 2017, + "end_year": 2020 + }, + { + "_id": "f96b112f-943e-43cd-90f0-56725cfa7e59", + "title": "Diverse asymmetric forecast", + "seasons": 9, + "episodes": 24, + "platform": "Amazon Prime", + "genres": [ + "Drama", + "Crime" + ], + "cast": [ + "Carl Johnson", + "Douglas Beck", + "Kevin Guerra", + "Taylor Wilson", + "Eric Jarvis", + "Sarah Charles MD" + ], + "start_year": 2007, + "end_year": null + }, + { + "_id": "78eb682f-a03d-4cbf-bbfc-0e899e5f50d0", + "title": "Profit-focused 
solution-oriented Graphical User Interface", + "seasons": 10, + "episodes": 117, + "platform": "HBO", + "genres": [ + "Crime", + "Fantasy" + ], + "cast": [ + "Carol Miller", + "Jennifer Bass", + "Melanie Leblanc" + ], + "start_year": 2002, + "end_year": null + }, + { + "_id": "ebb6d3c9-3c98-4799-94bc-aadd0bf2974c", + "title": "Reduced leadingedge system engine", + "seasons": 1, + "episodes": 58, + "platform": "Hulu", + "genres": [ + "Crime", + "Drama" + ], + "cast": [ + "James Warren", + "Kelly Carter", + "Sarah Jones", + "Aaron Castaneda", + "Katherine Manning" + ], + "start_year": 2011, + "end_year": null + }, + { + "_id": "4ffd32a7-0bf4-4c95-a7c8-19002c2eb83c", + "title": "Switchable 24/7 website", + "seasons": 6, + "episodes": 71, + "platform": "Netflix", + "genres": [ + "Documentary" + ], + "cast": [ + "Sarah Brown", + "Patrick Beck", + "Angela Herrera MD", + "Steven Mcconnell" + ], + "start_year": 2018, + "end_year": null + }, + { + "_id": "37267325-4337-4912-992f-a162f9014569", + "title": "Synergized asymmetric adapter", + "seasons": 4, + "episodes": 16, + "platform": "Hulu", + "genres": [ + "Fantasy" + ], + "cast": [ + "Gabrielle Meyer", + "Madison Matthews", + "Taylor Martinez" + ], + "start_year": 2010, + "end_year": null + }, + { + "_id": "ea2abd77-c7da-443e-89fd-6f410f5d697e", + "title": "Extended contextually-based customer loyalty", + "seasons": 1, + "episodes": 79, + "platform": "Hulu", + "genres": [ + "Fantasy" + ], + "cast": [ + "Michael Lewis", + "Cassandra Hicks", + "Sydney Garcia" + ], + "start_year": 2015, + "end_year": 2023 + }, + { + "_id": "b568dd56-c083-4431-a740-4f4b5f4e1b21", + "title": "Versatile grid-enabled application", + "seasons": 7, + "episodes": 82, + "platform": "Hulu", + "genres": [ + "Crime", + "Fantasy" + ], + "cast": [ + "Keith Brown", + "Annette Johnson", + "Joseph Carroll", + "Derek Lewis" + ], + "start_year": 2006, + "end_year": 2008 + }, + { + "_id": "b6f2e1c3-6915-4e02-b1c2-44b5bec8fd68", + "title": "Operative optimizing 
encryption", + "seasons": 2, + "episodes": 52, + "platform": "Amazon Prime", + "genres": [ + "Fantasy", + "Drama" + ], + "cast": [ + "Garrett Mcgrath", + "Craig Jackson", + "Michael Sullivan", + "Andrew Boyer" + ], + "start_year": 2011, + "end_year": null + }, + { + "_id": "51c225d5-aa67-4b14-aca5-33757cef6bf4", + "title": "Business-focused 24/7 collaboration", + "seasons": 1, + "episodes": 113, + "platform": "Netflix", + "genres": [ + "Thriller", + "Comedy" + ], + "cast": [ + "Matthew Hill", + "Andrew White", + "Grant Young", + "John Mathews" + ], + "start_year": 2015, + "end_year": 2020 + }, + { + "_id": "7465e69f-341e-4234-8ffb-400622442a40", + "title": "Organized bi-directional application", + "seasons": 3, + "episodes": 40, + "platform": "Netflix", + "genres": [ + "Comedy" + ], + "cast": [ + "Matthew Gordon", + "Mark Allen", + "Amanda Webb", + "Jeffrey Horton", + "Sheila Lewis", + "Marcus Gilbert" + ], + "start_year": 2011, + "end_year": null + }, + { + "_id": "90570eac-f923-4c30-a5b0-661b28a8e4a5", + "title": "Configurable bottom-line success", + "seasons": 10, + "episodes": 106, + "platform": "HBO", + "genres": [ + "Fantasy", + "Drama" + ], + "cast": [ + "Elizabeth Taylor", + "Melissa Mullins", + "Alan Nguyen", + "Carolyn Kidd", + "Michael Pope" + ], + "start_year": 2015, + "end_year": null + }, + { + "_id": "06d70791-5487-4dab-8b84-a91b3376e396", + "title": "Organic dedicated analyzer", + "seasons": 3, + "episodes": 88, + "platform": "HBO", + "genres": [ + "Thriller", + "Drama" + ], + "cast": [ + "Amy Aguilar", + "James Williams", + "Kevin Kirby" + ], + "start_year": 2010, + "end_year": 2025 + } +] \ No newline at end of file diff --git a/tests/integration/tools/mongodb/mongodbHelpers.ts b/tests/integration/tools/mongodb/mongodbHelpers.ts index 86ecdd70..efbfbb75 100644 --- a/tests/integration/tools/mongodb/mongodbHelpers.ts +++ b/tests/integration/tools/mongodb/mongodbHelpers.ts @@ -2,13 +2,38 @@ import { MongoCluster } from "mongodb-runner"; import path 
from "path"; import { fileURLToPath } from "url"; import fs from "fs/promises"; -import { MongoClient, ObjectId } from "mongodb"; +import { Document, MongoClient, ObjectId } from "mongodb"; import { getResponseContent, IntegrationTest, setupIntegrationTest, defaultTestConfig } from "../../helpers.js"; import { UserConfig } from "../../../../src/common/config.js"; import { afterAll, afterEach, beforeAll, beforeEach, describe, expect, it } from "vitest"; const __dirname = path.dirname(fileURLToPath(import.meta.url)); +const testDataDumpPath = path.join(__dirname, "..", "..", "..", "accuracy", "test-data-dumps"); + +const testDataPaths = [ + { + db: "comics", + collection: "books", + path: path.join(testDataDumpPath, "comics.books.json"), + }, + { + db: "comics", + collection: "characters", + path: path.join(testDataDumpPath, "comics.characters.json"), + }, + { + db: "mflix", + collection: "movies", + path: path.join(testDataDumpPath, "mflix.movies.json"), + }, + { + db: "mflix", + collection: "shows", + path: path.join(testDataDumpPath, "mflix.shows.json"), + }, +]; + interface MongoDBIntegrationTest { mongoClient: () => MongoClient; connectionString: () => string; @@ -170,3 +195,29 @@ export function validateAutoConnectBehavior( }); }); } + +export function prepareTestData(integration: MongoDBIntegrationTest) { + const testData: { + db: string; + collection: string; + data: Document[]; + }[] = []; + + beforeAll(async () => { + for (const { db, collection, path } of testDataPaths) { + testData.push({ + db, + collection, + data: JSON.parse(await fs.readFile(path, "utf8")) as Document[], + }); + } + }); + + return async function populateTestData() { + const client = integration.mongoClient(); + for (const { db, collection, data } of testData) { + await client.db(db).dropCollection(collection); + await client.db(db).collection(collection).insertMany(data); + } + }; +} From 0bd9167fa1ad927a5ce6f39c3e4dda6eebadf2ca Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Sun, 
6 Jul 2025 23:49:34 +0200 Subject: [PATCH 21/91] chore: moved all existing tests to vercel mcp client --- src/tools/mongodb/read/find.ts | 2 +- tests/accuracy/collection-indexes.test.ts | 38 ++--- tests/accuracy/collection-schema.test.ts | 12 +- tests/accuracy/delete-many.test.ts | 38 ++--- tests/accuracy/find.test.ts | 150 +++++++----------- tests/accuracy/insert-many.test.ts | 51 +++--- tests/accuracy/list-collections.test.ts | 62 ++++---- tests/accuracy/list-databases.test.ts | 30 ++-- tests/accuracy/sdk/accuracy-testing-client.ts | 28 +++- tests/accuracy/sdk/describe-accuracy-tests.ts | 2 +- tests/accuracy/sdk/models.ts | 4 +- 11 files changed, 181 insertions(+), 236 deletions(-) diff --git a/src/tools/mongodb/read/find.ts b/src/tools/mongodb/read/find.ts index efad0eb9..e8a40799 100644 --- a/src/tools/mongodb/read/find.ts +++ b/src/tools/mongodb/read/find.ts @@ -13,7 +13,7 @@ export const FindArgs = { .describe("The query filter, matching the syntax of the query argument of db.collection.find()"), projection: z .record(z.string(), z.unknown()) - // .optional() + .optional() .describe("The projection, matching the syntax of the projection argument of db.collection.find()"), limit: z.number().optional().default(10).describe("The maximum number of documents to return"), sort: z diff --git a/tests/accuracy/collection-indexes.test.ts b/tests/accuracy/collection-indexes.test.ts index 683f386a..e53ddb43 100644 --- a/tests/accuracy/collection-indexes.test.ts +++ b/tests/accuracy/collection-indexes.test.ts @@ -1,42 +1,30 @@ -import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; +import { describeAccuracyTests, describeSuite } from "./sdk/describe-accuracy-tests.js"; import { getAvailableModels } from "./sdk/models.js"; import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; -import { collectionIndexesResponse } from "../../src/tools/mongodb/read/collectionIndexes.js"; function callsCollectionIndexes(prompt: string): AccuracyTestConfig 
{ return { injectConnectedAssumption: true, prompt: prompt, - mockedTools: { - "collection-indexes": function collectionIndexes() { - return collectionIndexesResponse({ - database: "db1", - collection: "coll1", - indexes: [ - { - name: "year", - key: JSON.stringify({ _id: 1 }), - }, - ], - }); - }, - }, + mockedTools: {}, expectedToolCalls: [ { toolName: "collection-indexes", parameters: { - database: "db1", - collection: "coll1", + database: "mflix", + collection: "movies", }, }, ], }; } -describeAccuracyTests("collection-indexes", getAvailableModels(), [ - callsCollectionIndexes("How many indexes do I have in 'db1.coll1' namespace?"), - callsCollectionIndexes("List all the indexes in coll1 collection in db1 database"), - callsCollectionIndexes( - `Is the following query: ${JSON.stringify({ year: 1994 })} on the namespace 'db1.coll1' indexed?` - ), -]); +describeAccuracyTests(getAvailableModels(), { + ...describeSuite("should call 'collection-indexes' tool", [ + callsCollectionIndexes("How many indexes do I have in 'mflix.movies' namespace?"), + callsCollectionIndexes("List all the indexes in movies collection in mflix database"), + callsCollectionIndexes( + `Is the following query: ${JSON.stringify({ runtime: { $lt: 100 } })} on the namespace 'mflix.movies' indexed?` + ), + ]), +}); diff --git a/tests/accuracy/collection-schema.test.ts b/tests/accuracy/collection-schema.test.ts index e72c65de..f81273ea 100644 --- a/tests/accuracy/collection-schema.test.ts +++ b/tests/accuracy/collection-schema.test.ts @@ -1,4 +1,4 @@ -import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; +import { describeAccuracyTests, describeSuite } from "./sdk/describe-accuracy-tests.js"; import { getAvailableModels } from "./sdk/models.js"; import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; import { collectionSchemaResponse } from "../../src/tools/mongodb/metadata/collectionSchema.js"; @@ -41,7 +41,9 @@ function callsCollectionSchema(prompt: string): 
AccuracyTestConfig { }; } -describeAccuracyTests("collection-schema", getAvailableModels(), [ - callsCollectionSchema("Is there a title field in 'db1.coll1' namespace?"), - callsCollectionSchema("What is the type of value stored in title field in coll1 collection in db1 database?"), -]); +describeAccuracyTests(getAvailableModels(), { + ...describeSuite("should call 'collection-schema' tool", [ + callsCollectionSchema("Is there a title field in 'db1.coll1' namespace?"), + callsCollectionSchema("What is the type of value stored in title field in coll1 collection in db1 database?"), + ]), +}); diff --git a/tests/accuracy/delete-many.test.ts b/tests/accuracy/delete-many.test.ts index ddda1d50..4d50169d 100644 --- a/tests/accuracy/delete-many.test.ts +++ b/tests/accuracy/delete-many.test.ts @@ -1,4 +1,4 @@ -import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; +import { describeAccuracyTests, describeSuite } from "./sdk/describe-accuracy-tests.js"; import { getAvailableModels } from "./sdk/models.js"; import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; import { deleteManyResponse } from "../../src/tools/mongodb/delete/deleteMany.js"; @@ -7,17 +7,13 @@ function callsDeleteManyWithEmptyFilters(prompt: string): AccuracyTestConfig { return { injectConnectedAssumption: true, prompt: prompt, - mockedTools: { - "delete-many": function listDatabases() { - return deleteManyResponse("coll1", 10); - }, - }, + mockedTools: {}, expectedToolCalls: [ { toolName: "delete-many", parameters: { - database: "db1", - collection: "coll1", + database: "mflix", + collection: "movies", }, }, ], @@ -28,26 +24,26 @@ function callsDeleteManyWithFilters(prompt: string): AccuracyTestConfig { return { injectConnectedAssumption: true, prompt: prompt, - mockedTools: { - "delete-many": function listDatabases() { - return deleteManyResponse("coll1", 10); - }, - }, + mockedTools: {}, expectedToolCalls: [ { toolName: "delete-many", parameters: { - database: "db1", - 
collection: "coll1", - filters: { provider: "BongoDB" }, + database: "mflix", + collection: "movies", + filter: { runtime: { $lt: 100 } }, }, }, ], }; } -describeAccuracyTests("delete-many", getAvailableModels(), [ - callsDeleteManyWithEmptyFilters("Delete all the documents from 'db1.coll1' namespace"), - callsDeleteManyWithEmptyFilters("Purge the collection 'coll1' in database 'db1'"), - callsDeleteManyWithFilters("Remove all the documents from namespace 'db1.coll1' where provider is 'BongoDB'"), -]); +describeAccuracyTests(getAvailableModels(), { + ...describeSuite("should call 'delete-many' tool", [ + callsDeleteManyWithEmptyFilters("Delete all the documents from 'mflix.movies' namespace"), + callsDeleteManyWithEmptyFilters("Purge the collection 'movies' in database 'mflix'"), + callsDeleteManyWithFilters( + "Remove all the documents from namespace 'mflix.movies' where runtime is less than 100" + ), + ]), +}); diff --git a/tests/accuracy/find.test.ts b/tests/accuracy/find.test.ts index 0144e22b..ecfbe4f3 100644 --- a/tests/accuracy/find.test.ts +++ b/tests/accuracy/find.test.ts @@ -1,157 +1,129 @@ -import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; +import { describeAccuracyTests, describeSuite } from "./sdk/describe-accuracy-tests.js"; import { getAvailableModels } from "./sdk/models.js"; import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; -import { findResponse } from "../../src/tools/mongodb/read/find.js"; -import { MockedTools } from "./sdk/test-tools.js"; -import { collectionSchemaResponse } from "../../src/tools/mongodb/metadata/collectionSchema.js"; -import { getSimplifiedSchema } from "mongodb-schema"; -const documents = [ - { - title: "book1", - author: "author1", - date_of_publish: "01.01.1990", - }, - { - title: "book2", - author: "author1", - date_of_publish: "01.01.1992", - }, - { - title: "book3", - author: "author2", - date_of_publish: "01.01.1990", - }, -]; - -function callsFindNoFilter(prompt: string): 
AccuracyTestConfig { +function callsFindNoFilter(prompt: string, database = "mflix", collection = "movies"): AccuracyTestConfig { return { injectConnectedAssumption: true, prompt: prompt, - mockedTools: { - "collection-schema": async () => - collectionSchemaResponse("db1", "coll1", await getSimplifiedSchema(documents)), - find: () => findResponse("coll1", documents), - }, + mockedTools: {}, expectedToolCalls: [ { toolName: "find", parameters: { - database: "db1", - collection: "coll1", + database, + collection, }, }, ], }; } -function callsFindWithFilter(prompt: string): AccuracyTestConfig { +function callsFindWithFilter(prompt: string, filter: Record): AccuracyTestConfig { return { injectConnectedAssumption: true, prompt: prompt, - mockedTools: { - "collection-schema": async () => - collectionSchemaResponse("db1", "coll1", await getSimplifiedSchema(documents)), - find: () => - findResponse( - "coll1", - documents.filter((doc) => doc.author === "author1") - ), - }, + mockedTools: {}, expectedToolCalls: [ { toolName: "find", parameters: { - database: "db1", - collection: "coll1", - filter: { author: "author1" }, + database: "mflix", + collection: "movies", + filter: filter, }, }, ], }; } -function callsFindWithProjection(prompt: string): AccuracyTestConfig { +function callsFindWithProjection(prompt: string, projection: Record): AccuracyTestConfig { return { injectConnectedAssumption: true, prompt: prompt, - mockedTools: { - "collection-schema": async () => - collectionSchemaResponse("db1", "coll1", await getSimplifiedSchema(documents)), - find: () => findResponse("coll1", documents), - }, + mockedTools: {}, expectedToolCalls: [ { toolName: "find", parameters: { - database: "db1", - collection: "coll1", - projection: { title: 1 }, + database: "mflix", + collection: "movies", + projection, }, }, ], }; } -function callsFindWithProjectionAndFilters(prompt: string): AccuracyTestConfig { +function callsFindWithProjectionAndFilters( + prompt: string, + filter: Record, + 
projection: Record +): AccuracyTestConfig { return { injectConnectedAssumption: true, prompt: prompt, - mockedTools: { - "collection-schema": async () => - collectionSchemaResponse("db1", "coll1", await getSimplifiedSchema(documents)), - find: () => - findResponse( - "coll1", - documents.filter((doc) => doc.date_of_publish === "01.01.1992") - ), - }, + mockedTools: {}, expectedToolCalls: [ { toolName: "find", parameters: { - database: "db1", - collection: "coll1", - filter: { date_of_publish: "01.01.1992" }, - projection: { title: 1 }, + database: "mflix", + collection: "movies", + filter, + projection, }, }, ], }; } -function callsFindWithSortAndLimit(prompt: string): AccuracyTestConfig { +function callsFindWithFilterSortAndLimit( + prompt: string, + filter: Record, + sort: Record, + limit: number +): AccuracyTestConfig { return { injectConnectedAssumption: true, prompt: prompt, - mockedTools: { - "collection-schema": async () => - collectionSchemaResponse("db1", "coll1", await getSimplifiedSchema(documents)), - find: () => findResponse("coll1", [documents[0], documents[1]]), - }, + mockedTools: {}, expectedToolCalls: [ { toolName: "find", parameters: { - database: "db1", - collection: "coll1", - sort: { date_of_publish: 1 }, - limit: 2, + database: "mflix", + collection: "movies", + filter, + sort, + limit, }, }, ], }; } -describeAccuracyTests("find", getAvailableModels(), [ - callsFindNoFilter("List all the documents in 'db1.coll1' namespace"), - callsFindNoFilter("Find all the documents from collection coll1 in database db1"), - callsFindWithFilter("Find all the books published by author name 'author1' in db1.coll1 namespace"), - callsFindWithFilter("Find all the documents in coll1 collection and db1 database where author is 'author1'"), - callsFindWithProjection("Give me all the title of the books available in 'db1.coll1' namespace"), - callsFindWithProjection("Give me all the title of the books published in available in 'db1.coll1' namespace"), - 
callsFindWithProjectionAndFilters( - "Find all the book titles from 'db1.coll1' namespace where date_of_publish is '01.01.1992'" - ), - callsFindWithSortAndLimit("List first two books sorted by the field date_of_publish in namespace db1.coll1"), -]); +describeAccuracyTests(getAvailableModels(), { + ...describeSuite("should only call find tool", [ + callsFindNoFilter("List all the movies in 'mflix.movies' namespace."), + callsFindNoFilter("List all the documents in 'comics.books' namespace.", "comics", "books"), + callsFindWithFilter("Find all the movies in 'mflix.movies' namespace with runtime less than 100.", { + runtime: { $lt: 100 }, + }), + callsFindWithFilter("Find all movies in 'mflix.movies' collection where director is 'Christina Collins'", { + director: "Christina Collins", + }), + callsFindWithProjection("Give me all the movie titles available in 'mflix.movies' namespace", { title: 1 }), + callsFindWithProjectionAndFilters( + "Use 'mflix.movies' namespace to answer who were casted in the movie 'Certain Fish'", + { title: "Certain Fish" }, + { cast: 1 } + ), + callsFindWithFilterSortAndLimit( + "From the mflix.movies namespace, give me first 2 movies of Horror genre sorted ascending by their runtime", + { genres: "Horror" }, + { runtime: 1 }, + 2 + ), + ]), +}); diff --git a/tests/accuracy/insert-many.test.ts b/tests/accuracy/insert-many.test.ts index b720ac1c..25d60017 100644 --- a/tests/accuracy/insert-many.test.ts +++ b/tests/accuracy/insert-many.test.ts @@ -1,35 +1,30 @@ -import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; +import { describeAccuracyTests, describeSuite } from "./sdk/describe-accuracy-tests.js"; import { getAvailableModels } from "./sdk/models.js"; import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; -import { insertManyResponse } from "../../src/tools/mongodb/create/insertMany.js"; function callsInsertMany(prompt: string): AccuracyTestConfig { return { injectConnectedAssumption: true, prompt: 
prompt, - mockedTools: { - "insert-many": function listDatabases() { - return insertManyResponse("coll1", 3, ["1FOO", "2BAR", "3BAZ"]); - }, - }, + mockedTools: {}, expectedToolCalls: [ { toolName: "insert-many", parameters: { - database: "db1", - collection: "coll1", + database: "mflix", + collection: "movies", documents: [ { id: 1, - name: "name1", + title: "name1", }, { id: 2, - name: "name2", + title: "name2", }, { id: 3, - name: "name3", + title: "name3", }, ], }, @@ -42,17 +37,13 @@ function callsEmptyInsertMany(prompt: string) { return { injectConnectedAssumption: true, prompt: prompt, - mockedTools: { - "insert-many": function listDatabases() { - return insertManyResponse("coll1", 3, ["1FOO", "2BAR", "3BAZ"]); - }, - }, + mockedTools: {}, expectedToolCalls: [ { toolName: "insert-many", parameters: { - database: "db1", - collection: "coll1", + database: "mflix", + collection: "movies", documents: [{}, {}, {}], }, }, @@ -60,13 +51,15 @@ function callsEmptyInsertMany(prompt: string) { }; } -describeAccuracyTests("insert-many", getAvailableModels(), [ - callsInsertMany( - [ - "In my namespace 'db1.coll1', insert 3 documents each with the following fields:", - "- id: an incremental number starting from 1", - "- name: a string of format 'name'", - ].join("\n") - ), - callsEmptyInsertMany("Add three empty documents in collection 'coll1' inside database 'db1'"), -]); +describeAccuracyTests(getAvailableModels(), { + ...describeSuite("should call 'insert-many' tool", [ + callsInsertMany( + [ + "In my namespace 'mflix.movies', insert 3 documents each with the following fields:", + "- id: an incremental number starting from 1", + "- name: a string of format 'name'", + ].join("\n") + ), + callsEmptyInsertMany("Add three empty documents in collection 'movies' inside database 'mflix'"), + ]), +}); diff --git a/tests/accuracy/list-collections.test.ts b/tests/accuracy/list-collections.test.ts index ac086859..a8455418 100644 --- a/tests/accuracy/list-collections.test.ts +++ 
b/tests/accuracy/list-collections.test.ts @@ -1,22 +1,16 @@ -import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; +import { describeAccuracyTests, describeSuite } from "./sdk/describe-accuracy-tests.js"; import { getAvailableModels } from "./sdk/models.js"; import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; -import { listCollectionsResponse } from "../../src/tools/mongodb/metadata/listCollections.js"; -import { listDatabasesResponse } from "../../src/tools/mongodb/metadata/listDatabases.js"; function callsListCollections(prompt: string): AccuracyTestConfig { return { injectConnectedAssumption: true, prompt: prompt, - mockedTools: { - "list-collections": function listCollections() { - return listCollectionsResponse("db1", ["coll1", "coll2"]); - }, - }, + mockedTools: {}, expectedToolCalls: [ { toolName: "list-collections", - parameters: { database: "db1" }, + parameters: { database: "mflix" }, }, ], }; @@ -26,23 +20,7 @@ function callsListDatabasesAndListCollections(prompt: string): AccuracyTestConfi return { injectConnectedAssumption: true, prompt: prompt, - mockedTools: { - "list-collections": function listCollections() { - return listCollectionsResponse("db1", ["coll1", "coll2"]); - }, - "list-databases": function listDatabases() { - return listDatabasesResponse([ - { - name: "db1", - sizeOnDisk: "1024", - }, - { - name: "db2", - sizeOnDisk: "2048", - }, - ]); - }, - }, + mockedTools: {}, expectedToolCalls: [ { toolName: "list-databases", @@ -50,19 +28,35 @@ function callsListDatabasesAndListCollections(prompt: string): AccuracyTestConfi }, { toolName: "list-collections", - parameters: { database: "db1" }, + parameters: { database: "admin" }, + }, + { + toolName: "list-collections", + parameters: { database: "comics" }, + }, + { + toolName: "list-collections", + parameters: { database: "config" }, + }, + { + toolName: "list-collections", + parameters: { database: "local" }, }, { toolName: "list-collections", - parameters: 
{ database: "db2" }, + parameters: { database: "mflix" }, }, ], }; } -describeAccuracyTests("list-collections", getAvailableModels(), [ - callsListCollections("How many collections do I have in database db1?"), - callsListCollections("List all the collections in my MongoDB database db1."), - callsListCollections("Is there a coll1 collection in my MongoDB database db1?"), - callsListDatabasesAndListCollections("List all the collections that I have in total on my cluster?"), -]); +describeAccuracyTests(getAvailableModels(), { + ...describeSuite("should only call list-collections tool", [ + callsListCollections("How many collections do I have in database mflix?"), + callsListCollections("List all the collections in my MongoDB database mflix."), + callsListCollections("Is there a shows collection in my MongoDB database mflix?"), + ]), + ...describeSuite("should call list-databases and list-collections tool", [ + callsListDatabasesAndListCollections("List all the collections that I have in total on my cluster?"), + ]), +}); diff --git a/tests/accuracy/list-databases.test.ts b/tests/accuracy/list-databases.test.ts index 0a89db1d..0ef88712 100644 --- a/tests/accuracy/list-databases.test.ts +++ b/tests/accuracy/list-databases.test.ts @@ -1,26 +1,12 @@ -import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; +import { describeAccuracyTests, describeSuite } from "./sdk/describe-accuracy-tests.js"; import { getAvailableModels } from "./sdk/models.js"; import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; -import { listDatabasesResponse } from "../../src/tools/mongodb/metadata/listDatabases.js"; function callsListDatabases(prompt: string): AccuracyTestConfig { return { injectConnectedAssumption: true, prompt: prompt, - mockedTools: { - "list-databases": function listDatabases() { - return listDatabasesResponse([ - { - name: "db1", - sizeOnDisk: "1024", - }, - { - name: "db2", - sizeOnDisk: "2048", - }, - ]); - }, - }, + mockedTools: {}, 
expectedToolCalls: [ { toolName: "list-databases", @@ -30,8 +16,10 @@ function callsListDatabases(prompt: string): AccuracyTestConfig { }; } -describeAccuracyTests("list-databases", getAvailableModels(), [ - callsListDatabases("How many databases do I have?"), - callsListDatabases("List all the databases in my cluster."), - callsListDatabases("Is there a sample_mflix database in my cluster?"), -]); +describeAccuracyTests(getAvailableModels(), { + ...describeSuite("should only call list-databases tool", [ + callsListDatabases("How many databases do I have?"), + callsListDatabases("List all the databases that I have in my clusters"), + callsListDatabases("Is there a mflix database in my cluster?"), + ]), +}); diff --git a/tests/accuracy/sdk/accuracy-testing-client.ts b/tests/accuracy/sdk/accuracy-testing-client.ts index de7a0671..b12017d7 100644 --- a/tests/accuracy/sdk/accuracy-testing-client.ts +++ b/tests/accuracy/sdk/accuracy-testing-client.ts @@ -17,14 +17,14 @@ export type MockedTools = Record; export class AccuracyTestingClient { private mockedTools: MockedTools = {}; private recordedToolCalls: ToolCall[] = []; - private constructor(private readonly client: Awaited>) {} + private constructor(private readonly vercelMCPClient: Awaited>) {} async close() { - await this.client?.close(); + await this.vercelMCPClient?.close(); } async vercelTools() { - const vercelTools = (await this.client?.tools()) ?? {}; + const vercelTools = (await this.vercelMCPClient?.tools()) ?? 
{}; const rewrappedVercelTools: typeof vercelTools = {}; for (const [toolName, tool] of Object.entries(vercelTools)) { rewrappedVercelTools[toolName] = createVercelTool({ @@ -35,12 +35,24 @@ export class AccuracyTestingClient { toolName: toolName, parameters: args, }); - const toolResultGeneratorFn = this.mockedTools[toolName]; - if (toolResultGeneratorFn) { - return await toolResultGeneratorFn(args); - } + try { + const toolResultGeneratorFn = this.mockedTools[toolName]; + if (toolResultGeneratorFn) { + return await toolResultGeneratorFn(args); + } - return tool.execute(args, options); + return await tool.execute(args, options); + } catch (error) { + // There are cases when LLM calls the tools incorrectly + // and the schema definition check fails. Normally a + // tool calling agent will handle the error case but + // because we are wrapping the tool definition ourselves + // we have to handle this ourselves as well. + return { + isError: true, + content: JSON.stringify(error), + }; + } }, }); } diff --git a/tests/accuracy/sdk/describe-accuracy-tests.ts b/tests/accuracy/sdk/describe-accuracy-tests.ts index dd224387..466a9ed7 100644 --- a/tests/accuracy/sdk/describe-accuracy-tests.ts +++ b/tests/accuracy/sdk/describe-accuracy-tests.ts @@ -69,7 +69,7 @@ export function describeAccuracyTests( toolCalls ); console.debug(testConfig.prompt); - console.debug(`Conversation`, JSON.stringify(conversation, null, 2)); + // console.debug(`Conversation`, JSON.stringify(conversation, null, 2)); // console.debug(`Tool calls`, JSON.stringify(toolCalls, null, 2)); console.debug( "Tool calling accuracy: %s, Parameter Accuracy: %s", diff --git a/tests/accuracy/sdk/models.ts b/tests/accuracy/sdk/models.ts index 1fe4fd58..eb7f4b91 100644 --- a/tests/accuracy/sdk/models.ts +++ b/tests/accuracy/sdk/models.ts @@ -67,9 +67,9 @@ export class OllamaModel implements Model { } const ALL_TESTABLE_MODELS = [ - new GeminiModel("gemini-2.0-flash"), + // new GeminiModel("gemini-2.0-flash"), // new 
OpenAIModel("gpt-4o"), - // new AzureOpenAIModel("gpt-4o"), + new AzureOpenAIModel("gpt-4o"), // new OllamaModel("qwen3:1.7b"), ]; From efefd9dd09bc6064a50c4ea3442e72b47d3972c6 Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Mon, 7 Jul 2025 15:50:18 +0200 Subject: [PATCH 22/91] chore: adds tests for the rest of the tools --- .../accuracy/collection-storage-size.test.ts | 51 ++++++++++++ tests/accuracy/count.test.ts | 60 ++++++++++++++ tests/accuracy/create-collection.test.ts | 57 +++++++++++++ tests/accuracy/db-stats.test.ts | 25 ++++++ tests/accuracy/drop-collection.test.ts | 82 +++++++++++++++++++ tests/accuracy/drop-database.test.ts | 50 +++++++++++ tests/accuracy/explain.test.ts | 72 ++++++++++++++++ tests/accuracy/logs.test.ts | 31 +++++++ tests/accuracy/rename-collection.test.ts | 49 +++++++++++ tests/accuracy/sdk/agent.ts | 2 +- tests/accuracy/sdk/describe-accuracy-tests.ts | 3 +- tests/accuracy/update-many.test.ts | 60 ++++++++++++++ .../tools/mongodb/mongodbHelpers.ts | 24 ++++-- 13 files changed, 558 insertions(+), 8 deletions(-) create mode 100644 tests/accuracy/collection-storage-size.test.ts create mode 100644 tests/accuracy/count.test.ts create mode 100644 tests/accuracy/create-collection.test.ts create mode 100644 tests/accuracy/db-stats.test.ts create mode 100644 tests/accuracy/drop-collection.test.ts create mode 100644 tests/accuracy/drop-database.test.ts create mode 100644 tests/accuracy/explain.test.ts create mode 100644 tests/accuracy/logs.test.ts create mode 100644 tests/accuracy/rename-collection.test.ts create mode 100644 tests/accuracy/update-many.test.ts diff --git a/tests/accuracy/collection-storage-size.test.ts b/tests/accuracy/collection-storage-size.test.ts new file mode 100644 index 00000000..751b84d6 --- /dev/null +++ b/tests/accuracy/collection-storage-size.test.ts @@ -0,0 +1,51 @@ +import { describeAccuracyTests, describeSuite } from "./sdk/describe-accuracy-tests.js"; +import { getAvailableModels } from "./sdk/models.js"; 
+import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; +import { ExpectedToolCall } from "./sdk/accuracy-scorers.js"; + +function callsCollectionStorageSize(prompt: string, expectedToolCalls: ExpectedToolCall[]): AccuracyTestConfig { + return { + injectConnectedAssumption: true, + prompt: prompt, + mockedTools: {}, + expectedToolCalls: expectedToolCalls, + }; +} + +describeAccuracyTests(getAvailableModels(), { + ...describeSuite("should only call 'collection-storage-size' tool", [ + callsCollectionStorageSize("What is the size of 'mflix.movies' namespace", [ + { + toolName: "collection-storage-size", + parameters: { + database: "mflix", + collection: "movies", + }, + }, + ]), + ]), + ...describeSuite("should call 'collection-storage-size' tool after another tool/s", [ + callsCollectionStorageSize("How much size is each collection in comics database", [ + { + toolName: "list-collections", + parameters: { + database: "comics", + }, + }, + { + toolName: "collection-storage-size", + parameters: { + database: "comics", + collection: "books", + }, + }, + { + toolName: "collection-storage-size", + parameters: { + database: "comics", + collection: "characters", + }, + }, + ]), + ]), +}); diff --git a/tests/accuracy/count.test.ts b/tests/accuracy/count.test.ts new file mode 100644 index 00000000..0543af76 --- /dev/null +++ b/tests/accuracy/count.test.ts @@ -0,0 +1,60 @@ +import { describeAccuracyTests, describeSuite } from "./sdk/describe-accuracy-tests.js"; +import { getAvailableModels } from "./sdk/models.js"; +import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; + +function callsCountToolWithEmptyQuery(prompt: string, database = "mflix", collection = "movies"): AccuracyTestConfig { + return { + injectConnectedAssumption: true, + prompt: prompt, + mockedTools: {}, + expectedToolCalls: [ + { + toolName: "count", + parameters: { + database, + collection, + }, + }, + ], + }; +} + +function callsCountToolWithQuery( + prompt: string, + 
database = "mflix", + collection = "movies", + query: Record = {} +): AccuracyTestConfig { + return { + injectConnectedAssumption: true, + prompt: prompt, + mockedTools: {}, + expectedToolCalls: [ + { + toolName: "count", + parameters: { + database, + collection, + query, + }, + }, + ], + }; +} + +describeAccuracyTests(getAvailableModels(), { + ...describeSuite("should only call 'count' tool", [ + callsCountToolWithEmptyQuery("Count number of documents in 'mflix.movies' namespace."), + callsCountToolWithEmptyQuery( + "How many documents are there in 'characters' collection in 'comics' database?", + "comics", + "characters" + ), + callsCountToolWithQuery( + "Count all the documents in 'mflix.movies' namespace with runtime less than 100?", + "mflix", + "movies", + { runtime: { $lt: 100 } } + ), + ]), +}); diff --git a/tests/accuracy/create-collection.test.ts b/tests/accuracy/create-collection.test.ts new file mode 100644 index 00000000..ab468a62 --- /dev/null +++ b/tests/accuracy/create-collection.test.ts @@ -0,0 +1,57 @@ +import { describeAccuracyTests, describeSuite } from "./sdk/describe-accuracy-tests.js"; +import { getAvailableModels } from "./sdk/models.js"; +import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; +import { ExpectedToolCall } from "./sdk/accuracy-scorers.js"; + +function callsCreateCollection(prompt: string, database: string, collection: string): AccuracyTestConfig { + return { + injectConnectedAssumption: true, + prompt: prompt, + mockedTools: {}, + expectedToolCalls: [ + { + toolName: "create-collection", + parameters: { + database, + collection, + }, + }, + ], + }; +} + +function callsCreateCollectionWithListCollections(prompt: string, expectedToolCalls: ExpectedToolCall[]) { + return { + injectConnectedAssumption: true, + prompt: prompt, + mockedTools: {}, + expectedToolCalls, + }; +} + +describeAccuracyTests(getAvailableModels(), { + ...describeSuite("should only call 'create-collection' tool", [ + 
callsCreateCollection("Create a new namespace 'mflix.documentaries'", "mflix", "documentaries"), + callsCreateCollection("Create a new collection villains in comics database", "comics", "villains"), + ]), + ...describeSuite("should call 'create-collection' alongside other required tools", [ + callsCreateCollectionWithListCollections( + "If and only if, the namespace 'mflix.documentaries' does not exist, then create it", + [ + { + toolName: "list-collections", + parameters: { + database: "mflix", + }, + }, + { + toolName: "create-collection", + parameters: { + database: "mflix", + collection: "documentaries", + }, + }, + ] + ), + ]), +}); diff --git a/tests/accuracy/db-stats.test.ts b/tests/accuracy/db-stats.test.ts new file mode 100644 index 00000000..b88fbb3c --- /dev/null +++ b/tests/accuracy/db-stats.test.ts @@ -0,0 +1,25 @@ +import { describeAccuracyTests, describeSuite } from "./sdk/describe-accuracy-tests.js"; +import { getAvailableModels } from "./sdk/models.js"; +import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; + +function callsListDatabases(prompt: string, database = "mflix"): AccuracyTestConfig { + return { + injectConnectedAssumption: true, + prompt: prompt, + mockedTools: {}, + expectedToolCalls: [ + { + toolName: "db-stats", + parameters: { + database, + }, + }, + ], + }; +} + +describeAccuracyTests(getAvailableModels(), { + ...describeSuite("should only call 'db-stats' tool", [ + callsListDatabases("What is the size occupied by database mflix?"), + ]), +}); diff --git a/tests/accuracy/drop-collection.test.ts b/tests/accuracy/drop-collection.test.ts new file mode 100644 index 00000000..e51494b7 --- /dev/null +++ b/tests/accuracy/drop-collection.test.ts @@ -0,0 +1,82 @@ +import { describeAccuracyTests, describeSuite } from "./sdk/describe-accuracy-tests.js"; +import { getAvailableModels } from "./sdk/models.js"; +import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; +import { ExpectedToolCall } from 
"./sdk/accuracy-scorers.js"; + +function onlyCallsDropCollection(prompt: string): AccuracyTestConfig { + return { + injectConnectedAssumption: true, + prompt: prompt, + mockedTools: {}, + expectedToolCalls: [ + { + toolName: "drop-collection", + parameters: { + database: "mflix", + collection: "movies", + }, + }, + ], + }; +} + +function callsDropCollection(prompt: string, expectedToolCalls: ExpectedToolCall[]): AccuracyTestConfig { + return { + injectConnectedAssumption: true, + prompt: prompt, + mockedTools: {}, + expectedToolCalls, + }; +} + +describeAccuracyTests(getAvailableModels(), { + ...describeSuite("should only call 'drop-collection' tool", [ + onlyCallsDropCollection("Remove mflix.movies namespace from my cluster."), + onlyCallsDropCollection("Drop movies collection from mflix database."), + ]), + ...describeSuite("should call 'drop-collection' after calling other necessary tools", [ + callsDropCollection("Remove books collection from which ever database contains it.", [ + { + toolName: "list-databases", + parameters: {}, + }, + { + toolName: "list-collections", + parameters: { + database: "admin", + }, + }, + { + toolName: "list-collections", + parameters: { + database: "comics", + }, + }, + { + toolName: "list-collections", + parameters: { + database: "config", + }, + }, + { + toolName: "list-collections", + parameters: { + database: "local", + }, + }, + { + toolName: "list-collections", + parameters: { + database: "mflix", + }, + }, + { + toolName: "drop-collection", + parameters: { + database: "comics", + collection: "books", + }, + }, + ]), + ]), +}); diff --git a/tests/accuracy/drop-database.test.ts b/tests/accuracy/drop-database.test.ts new file mode 100644 index 00000000..08ffe640 --- /dev/null +++ b/tests/accuracy/drop-database.test.ts @@ -0,0 +1,50 @@ +import { describeAccuracyTests, describeSuite } from "./sdk/describe-accuracy-tests.js"; +import { getAvailableModels } from "./sdk/models.js"; +import { AccuracyTestConfig } from 
"./sdk/describe-accuracy-tests.js"; +import { ExpectedToolCall } from "./sdk/accuracy-scorers.js"; + +function onlyCallsDropDatabase(prompt: string): AccuracyTestConfig { + return { + injectConnectedAssumption: true, + prompt: prompt, + mockedTools: {}, + expectedToolCalls: [ + { + toolName: "drop-database", + parameters: { + database: "mflix", + }, + }, + ], + }; +} + +function callsDropDatabase(prompt: string, expectedToolCalls: ExpectedToolCall[]): AccuracyTestConfig { + return { + injectConnectedAssumption: true, + prompt: prompt, + mockedTools: {}, + expectedToolCalls, + }; +} + +describeAccuracyTests(getAvailableModels(), { + ...describeSuite("should only call 'drop-database' tool", [ + onlyCallsDropDatabase("Remove mflix database from my cluster."), + onlyCallsDropDatabase("Drop database named mflix."), + ]), + ...describeSuite("should call 'drop-database' after calling other necessary tools", [ + callsDropDatabase("If there is a mflix database in my cluster then drop it.", [ + { + toolName: "list-databases", + parameters: {}, + }, + { + toolName: "drop-database", + parameters: { + database: "mflix", + }, + }, + ]), + ]), +}); diff --git a/tests/accuracy/explain.test.ts b/tests/accuracy/explain.test.ts new file mode 100644 index 00000000..6e767981 --- /dev/null +++ b/tests/accuracy/explain.test.ts @@ -0,0 +1,72 @@ +import { describeAccuracyTests, describeSuite } from "./sdk/describe-accuracy-tests.js"; +import { getAvailableModels } from "./sdk/models.js"; +import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; + +function callsExplain(prompt: string, method: Record): AccuracyTestConfig { + return { + injectConnectedAssumption: true, + prompt: prompt, + mockedTools: {}, + expectedToolCalls: [ + { + toolName: "explain", + parameters: { + database: "mflix", + collection: "movies", + method: [method], + }, + }, + ], + }; +} + +const callsExplainWithFind = (prompt: string) => + callsExplain(prompt, { + name: "find", + arguments: { + filter: { 
release_year: 2020 }, + }, + }); + +const callsExplainWithAggregate = (prompt: string) => + callsExplain(prompt, { + name: "aggregate", + arguments: { + pipeline: [ + { + $match: { release_year: 2020 }, + }, + ], + }, + }); + +const callsExplainWithCount = (prompt: string) => + callsExplain(prompt, { + name: "count", + arguments: { + query: { release_year: 2020 }, + }, + }); + +/** + * None of these tests score a parameter match on any of the models, likely + * because we are using Zod.union, when we probably should've used + * Zod.discriminatedUnion + */ +describeAccuracyTests(getAvailableModels(), { + ...describeSuite("should call 'explain' tool for a find query", [ + callsExplainWithFind( + `Will fetching documents, where release_year is 2020, from 'mflix.movies' namespace perform a collection scan?` + ), + ]), + ...describeSuite("should call 'explain' tool for an aggregation", [ + callsExplainWithAggregate( + `Will aggregating documents, where release_year is 2020, from 'mflix.movies' namespace perform a collection scan?` + ), + ]), + ...describeSuite("should call 'explain' tool for count", [ + callsExplainWithCount( + `Will counting documents, where release_year is 2020, from 'mflix.movies' namespace perform a collection scan?` + ), + ]), +}); diff --git a/tests/accuracy/logs.test.ts b/tests/accuracy/logs.test.ts new file mode 100644 index 00000000..afd2a697 --- /dev/null +++ b/tests/accuracy/logs.test.ts @@ -0,0 +1,31 @@ +import { describeAccuracyTests, describeSuite } from "./sdk/describe-accuracy-tests.js"; +import { getAvailableModels } from "./sdk/models.js"; +import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; +import { ExpectedToolCall } from "./sdk/accuracy-scorers.js"; + +function callsLogsTool(prompt: string, toolCall: ExpectedToolCall): AccuracyTestConfig { + return { + injectConnectedAssumption: true, + prompt: prompt, + mockedTools: {}, + expectedToolCalls: [toolCall], + }; +} + +describeAccuracyTests(getAvailableModels(), { + 
...describeSuite("should call 'logs' tool", [ + callsLogsTool("Were there any startup warnings for my MongoDB server?", { + toolName: "mongodb-logs", + parameters: { + type: "startupWarnings", + }, + }), + callsLogsTool("Retrieve first 10 logs for my MongoDB server?", { + toolName: "mongodb-logs", + parameters: { + type: "global", + limit: 10, + }, + }), + ]), +}); diff --git a/tests/accuracy/rename-collection.test.ts b/tests/accuracy/rename-collection.test.ts new file mode 100644 index 00000000..d8d46025 --- /dev/null +++ b/tests/accuracy/rename-collection.test.ts @@ -0,0 +1,49 @@ +import { describeAccuracyTests, describeSuite } from "./sdk/describe-accuracy-tests.js"; +import { getAvailableModels } from "./sdk/models.js"; +import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; + +function callsRenameCollection(prompt: string): AccuracyTestConfig { + return { + injectConnectedAssumption: true, + prompt: prompt, + mockedTools: {}, + expectedToolCalls: [ + { + toolName: "rename-collection", + parameters: { + database: "mflix", + collection: "movies", + newName: "new_movies", + }, + }, + ], + }; +} + +function callsRenameCollectionWithDropTarget(prompt: string): AccuracyTestConfig { + return { + injectConnectedAssumption: true, + prompt: prompt, + mockedTools: {}, + expectedToolCalls: [ + { + toolName: "rename-collection", + parameters: { + database: "mflix", + collection: "movies", + newName: "new_movies", + dropTarget: true, + }, + }, + ], + }; +} + +describeAccuracyTests(getAvailableModels(), { + ...describeSuite("should only call 'rename-collection' tool", [ + callsRenameCollection("Rename my 'mflix.movies' namespace to 'mflix.new_movies'"), + callsRenameCollectionWithDropTarget( + "Rename my 'mflix.movies' namespace to 'mflix.new_movies' while removing the old namespace." 
+ ), + ]), +}); diff --git a/tests/accuracy/sdk/agent.ts b/tests/accuracy/sdk/agent.ts index eb680358..6997ffb6 100644 --- a/tests/accuracy/sdk/agent.ts +++ b/tests/accuracy/sdk/agent.ts @@ -6,7 +6,7 @@ const systemPrompt = [ "You are an expert AI assistant with access to a set of tools for MongoDB database operations.", "You MUST use the most relevant tool to answer the user's request", "When calling a tool, you MUST strictly follow its input schema and MUST provide all required arguments", - "If a task requires multiple steps, you MUST call the necessary tools in sequence", + "If a task requires multiple tool calls, you MUST call all the necessary tools in sequence, following the requirements mentioned above for each tool called.", 'If you do not know the answer or the request cannot be fulfilled, you MUST reply with "I don\'t know"', ]; diff --git a/tests/accuracy/sdk/describe-accuracy-tests.ts b/tests/accuracy/sdk/describe-accuracy-tests.ts index 466a9ed7..7a49b550 100644 --- a/tests/accuracy/sdk/describe-accuracy-tests.ts +++ b/tests/accuracy/sdk/describe-accuracy-tests.ts @@ -33,7 +33,7 @@ export function describeAccuracyTests( eachModel(`$modelName`, function (model) { const mdbIntegration = setupMongoDBIntegrationTest(); - const populateTestData = prepareTestData(mdbIntegration); + const { populateTestData, cleanupTestDatabases } = prepareTestData(mdbIntegration); let testMCPClient: AccuracyTestingClient; let agent: Agent; @@ -44,6 +44,7 @@ export function describeAccuracyTests( }); beforeEach(async () => { + await cleanupTestDatabases(mdbIntegration); await populateTestData(); testMCPClient.resetForTests(); }); diff --git a/tests/accuracy/update-many.test.ts b/tests/accuracy/update-many.test.ts new file mode 100644 index 00000000..4b82fbfb --- /dev/null +++ b/tests/accuracy/update-many.test.ts @@ -0,0 +1,60 @@ +import { describeAccuracyTests, describeSuite } from "./sdk/describe-accuracy-tests.js"; +import { getAvailableModels } from "./sdk/models.js"; 
+import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; + +function callsUpdateManyWithEmptyFilters(prompt: string): AccuracyTestConfig { + return { + injectConnectedAssumption: true, + prompt: prompt, + mockedTools: {}, + expectedToolCalls: [ + { + toolName: "update-many", + parameters: { + database: "mflix", + collection: "movies", + update: { + $set: { + new_field: 1, + }, + }, + }, + }, + ], + }; +} + +function callsUpdateManyWithFilters(prompt: string, filter: Record): AccuracyTestConfig { + return { + injectConnectedAssumption: true, + prompt: prompt, + mockedTools: {}, + expectedToolCalls: [ + { + toolName: "update-many", + parameters: { + database: "mflix", + collection: "movies", + filter, + update: { + $set: { + new_field: 1, + }, + }, + }, + }, + ], + }; +} + +describeAccuracyTests(getAvailableModels(), { + ...describeSuite("should only call aggregate tool", [ + callsUpdateManyWithEmptyFilters( + "Update all the documents in 'mflix.movies' namespace with a new field 'new_field' set to 1" + ), + callsUpdateManyWithFilters( + "Update all the documents in 'mflix.movies' namespace, where runtime is less than 100, with a new field 'new_field' set to 1", + { runtime: { $lt: 100 } } + ), + ]), +}); diff --git a/tests/integration/tools/mongodb/mongodbHelpers.ts b/tests/integration/tools/mongodb/mongodbHelpers.ts index efbfbb75..05cee212 100644 --- a/tests/integration/tools/mongodb/mongodbHelpers.ts +++ b/tests/integration/tools/mongodb/mongodbHelpers.ts @@ -197,6 +197,7 @@ export function validateAutoConnectBehavior( } export function prepareTestData(integration: MongoDBIntegrationTest) { + const NON_TEST_DBS = ["admin", "config", "local"]; const testData: { db: string; collection: string; @@ -213,11 +214,22 @@ export function prepareTestData(integration: MongoDBIntegrationTest) { } }); - return async function populateTestData() { - const client = integration.mongoClient(); - for (const { db, collection, data } of testData) { - await 
client.db(db).dropCollection(collection); - await client.db(db).collection(collection).insertMany(data); - } + return { + async populateTestData(this: void) { + const client = integration.mongoClient(); + for (const { db, collection, data } of testData) { + await client.db(db).collection(collection).insertMany(data); + } + }, + async cleanupTestDatabases(this: void, integration: MongoDBIntegrationTest) { + const client = integration.mongoClient(); + const admin = client.db().admin(); + const databases = await admin.listDatabases(); + await Promise.all( + databases.databases + .filter(({ name }) => !NON_TEST_DBS.includes(name)) + .map(({ name }) => client.db(name).dropDatabase()) + ); + }, }; } From 06422a74f7da9ba4d1632b78c83596d26d30ddd3 Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Mon, 7 Jul 2025 16:13:34 +0200 Subject: [PATCH 23/91] chore: adds missed out tests for tools --- tests/accuracy/aggregate.test.ts | 28 +++++++++++++++++++++++ tests/accuracy/create-index.test.ts | 35 +++++++++++++++++++++++++++++ 2 files changed, 63 insertions(+) create mode 100644 tests/accuracy/aggregate.test.ts create mode 100644 tests/accuracy/create-index.test.ts diff --git a/tests/accuracy/aggregate.test.ts b/tests/accuracy/aggregate.test.ts new file mode 100644 index 00000000..3da1ca32 --- /dev/null +++ b/tests/accuracy/aggregate.test.ts @@ -0,0 +1,28 @@ +import { describeAccuracyTests, describeSuite } from "./sdk/describe-accuracy-tests.js"; +import { getAvailableModels } from "./sdk/models.js"; +import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; + +function callsAggregate(prompt: string, pipeline: Record[]): AccuracyTestConfig { + return { + injectConnectedAssumption: true, + prompt: prompt, + mockedTools: {}, + expectedToolCalls: [ + { + toolName: "aggregate", + parameters: { + pipeline: pipeline, + }, + }, + ], + }; +} + +describeAccuracyTests(getAvailableModels(), { + ...describeSuite("should call 'aggregate' tool", [ + callsAggregate( + "Group all 
the movies in 'mflix.movies' namespace by 'release_year' and give me a count of them", + [{ $group: { _id: "$release_year", count: { $sum: 1 } } }] + ), + ]), +}); diff --git a/tests/accuracy/create-index.test.ts b/tests/accuracy/create-index.test.ts new file mode 100644 index 00000000..82e98e92 --- /dev/null +++ b/tests/accuracy/create-index.test.ts @@ -0,0 +1,35 @@ +import { describeAccuracyTests, describeSuite } from "./sdk/describe-accuracy-tests.js"; +import { getAvailableModels } from "./sdk/models.js"; +import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; + +function callsCreateIndex(prompt: string, indexKeys: Record): AccuracyTestConfig { + return { + injectConnectedAssumption: true, + prompt: prompt, + mockedTools: {}, + expectedToolCalls: [ + { + toolName: "create-index", + parameters: { + database: "mflix", + collection: "movies", + keys: indexKeys, + }, + }, + ], + }; +} + +describeAccuracyTests(getAvailableModels(), { + ...describeSuite("should call 'create-index' tool", [ + callsCreateIndex( + "Create an index that covers the following query on 'mflix.movies' namespace - { \"release_year\": 1992 }", + { + release_year: 1, + } + ), + callsCreateIndex("Create a text index on title field in 'mflix.movies' namespace", { + title: "text", + }), + ]), +}); From 6039b1d5f48f9ca1d8eecd1362427dee088086af Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Tue, 8 Jul 2025 11:06:13 +0200 Subject: [PATCH 24/91] chore: MongoDB based snapshot storage for accuracy runs introduces the following necessary env variables: - MDB_ACCURACY_RUN_ID: The accuracy run id - MDB_ACCURACY_MDB_URL: The connection string to mongodb instance where the snapshots will be stored - MDB_ACCURACY_MDB_DB: The database for snapshots - MDB_ACCURACY_MDB_COLLECTION: The collection for snapshots --- package-lock.json | 33 +++++++ .../get-snapshot-storage.ts | 19 ++++ .../mdb-snapshot-storage.ts | 86 +++++++++++++++++++ .../snapshot-storage.ts | 51 +++++++++++ 
tests/accuracy/sdk/agent.ts | 29 +++++-- tests/accuracy/sdk/describe-accuracy-tests.ts | 31 ++++--- tests/accuracy/sdk/git-info.ts | 7 ++ 7 files changed, 241 insertions(+), 15 deletions(-) create mode 100644 tests/accuracy/sdk/accuracy-snapshot-storage/get-snapshot-storage.ts create mode 100644 tests/accuracy/sdk/accuracy-snapshot-storage/mdb-snapshot-storage.ts create mode 100644 tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.ts create mode 100644 tests/accuracy/sdk/git-info.ts diff --git a/package-lock.json b/package-lock.json index 0f20cf48..235bc95d 100644 --- a/package-lock.json +++ b/package-lock.json @@ -2019,6 +2019,23 @@ "jsep": "^0.4.0||^1.0.0" } }, + "node_modules/@kwsites/file-exists": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/@kwsites/file-exists/-/file-exists-1.1.1.tgz", + "integrity": "sha512-m9/5YGR18lIwxSFDwfE3oA7bWuq9kdau6ugN4H2rJeyhFQZcG9AgSHkQtSD15a8WvTgfz9aikZMrKPHvbpqFiw==", + "dev": true, + "license": "MIT", + "dependencies": { + "debug": "^4.1.1" + } + }, + "node_modules/@kwsites/promise-deferred": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/@kwsites/promise-deferred/-/promise-deferred-1.1.1.tgz", + "integrity": "sha512-GaHYm+c0O9MjZRu0ongGBRbinu8gVAMd2UZjji6jVmqKtZluZnptXGWhz1E8j8D2HJ3f/yMxKAUC0b+57wncIw==", + "dev": true, + "license": "MIT" + }, "node_modules/@modelcontextprotocol/inspector": { "version": "0.16.1", "resolved": "https://registry.npmjs.org/@modelcontextprotocol/inspector/-/inspector-0.16.1.tgz", @@ -12154,6 +12171,22 @@ "simple-concat": "^1.0.0" } }, + "node_modules/simple-git": { + "version": "3.28.0", + "resolved": "https://registry.npmjs.org/simple-git/-/simple-git-3.28.0.tgz", + "integrity": "sha512-Rs/vQRwsn1ILH1oBUy8NucJlXmnnLeLCfcvbSehkPzbv3wwoFWIdtfd6Ndo6ZPhlPsCZ60CPI4rxurnwAa+a2w==", + "dev": true, + "license": "MIT", + "dependencies": { + "@kwsites/file-exists": "^1.1.1", + "@kwsites/promise-deferred": "^1.1.1", + "debug": "^4.4.0" + }, + "funding": { + 
"type": "github", + "url": "https://github.com/steveukx/git-js?sponsor=1" + } + }, "node_modules/simple-oauth2": { "version": "5.1.0", "resolved": "https://registry.npmjs.org/simple-oauth2/-/simple-oauth2-5.1.0.tgz", diff --git a/tests/accuracy/sdk/accuracy-snapshot-storage/get-snapshot-storage.ts b/tests/accuracy/sdk/accuracy-snapshot-storage/get-snapshot-storage.ts new file mode 100644 index 00000000..44c8ae3d --- /dev/null +++ b/tests/accuracy/sdk/accuracy-snapshot-storage/get-snapshot-storage.ts @@ -0,0 +1,19 @@ +import { getCommitSHA } from "../git-info.js"; +import { MongoDBSnapshotStorage } from "./mdb-snapshot-storage.js"; +import { AccuracySnapshotStorage } from "./snapshot-storage.js"; + +export async function getAccuracySnapshotStorage(): Promise { + const accuracyRunId = process.env.MDB_ACCURACY_RUN_ID; + if (!accuracyRunId) { + throw new Error( + "Cannot create AccuracySnapshotStorage without an accuracyRunId - ensure that the relevant env variable is present." + ); + } + + const commitSHA = await getCommitSHA(); + if (!commitSHA) { + throw new Error("Cannot create AccuracySnapshotStorage without a commitSHA."); + } + + return MongoDBSnapshotStorage.getStorage(commitSHA, accuracyRunId); +} diff --git a/tests/accuracy/sdk/accuracy-snapshot-storage/mdb-snapshot-storage.ts b/tests/accuracy/sdk/accuracy-snapshot-storage/mdb-snapshot-storage.ts new file mode 100644 index 00000000..f8296a8a --- /dev/null +++ b/tests/accuracy/sdk/accuracy-snapshot-storage/mdb-snapshot-storage.ts @@ -0,0 +1,86 @@ +import { Collection, MongoClient } from "mongodb"; +import { AccuracySnapshotEntry, AccuracySnapshotEntrySchema, AccuracySnapshotStorage } from "./snapshot-storage.js"; + +export class MongoDBSnapshotStorage implements AccuracySnapshotStorage { + private readonly client: MongoClient; + private readonly snapshotCollection: Collection; + private readonly accuracyRunId: string; + private readonly commitSHA: string; + private constructor({ + mongodbUrl, + database, + 
collection, + accuracyRunId, + commitSHA, + }: { + mongodbUrl: string; + database: string; + collection: string; + accuracyRunId: string; + commitSHA: string; + }) { + this.client = new MongoClient(mongodbUrl); + this.snapshotCollection = this.client.db(database).collection(collection); + this.accuracyRunId = accuracyRunId; + this.commitSHA = commitSHA; + } + + async createSnapshotEntry( + snapshotEntry: Pick< + AccuracySnapshotEntry, + | "requestedModel" + | "test" + | "prompt" + | "toolCallingAccuracy" + | "parameterAccuracy" + | "llmResponseTime" + | "tokensUsage" + | "respondingModel" + | "text" + | "messages" + > + ): Promise { + const snapshotWithMeta: AccuracySnapshotEntry = { + ...snapshotEntry, + commitSHA: this.commitSHA, + accuracyRunId: this.accuracyRunId, + createdOn: Date.now(), + }; + await this.snapshotCollection.insertOne(snapshotWithMeta); + } + + async getLastRunIdForCommit(commit: string): Promise { + const document = await this.snapshotCollection.findOne( + { commit: commit }, + { sort: { createdOn: -1 }, projection: { accuracyRunId: 1 } } + ); + + return document?.accuracyRunId ? 
`${document?.accuracyRunId}` : undefined; + } + + async getSnapshotEntriesForRunId(accuracyRunId: string): Promise { + const snapshotEntries = await this.snapshotCollection.find({ accuracyRunId }).toArray(); + return AccuracySnapshotEntrySchema.array().parse(snapshotEntries); + } + + static getStorage(commitSHA: string, accuracyRunId: string): MongoDBSnapshotStorage { + const mongodbUrl = process.env.MDB_ACCURACY_MDB_URL; + const database = process.env.MDB_ACCURACY_MDB_DB; + const collection = process.env.MDB_ACCURACY_MDB_COLLECTION; + if (!mongodbUrl || !database || !collection) { + throw new Error("Cannot create MongoDBAccuracySnapshot storage without relevant configuration provided"); + } + + return new MongoDBSnapshotStorage({ + mongodbUrl, + database, + collection, + commitSHA, + accuracyRunId, + }); + } + + async close(): Promise { + await this.client.close(); + } +} diff --git a/tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.ts b/tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.ts new file mode 100644 index 00000000..a6f92807 --- /dev/null +++ b/tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.ts @@ -0,0 +1,51 @@ +import z from "zod"; + +export const AccuracySnapshotEntrySchema = z.object({ + // Git and meta information for snapshot entries + accuracyRunId: z.string(), + createdOn: z.number(), + commitSHA: z.string(), + // Accuracy info + requestedModel: z.string(), + test: z.string(), + prompt: z.string(), + toolCallingAccuracy: z.number(), + parameterAccuracy: z.number(), + llmResponseTime: z.number(), + tokensUsage: z + .object({ + promptTokens: z.number().optional(), + completionTokens: z.number().optional(), + totalTokens: z.number().optional(), + }) + .optional(), + respondingModel: z.string(), + text: z.string(), + messages: z.array(z.record(z.string(), z.unknown())), +}); + +export type AccuracySnapshotEntry = z.infer; + +export interface AccuracySnapshotStorage { + createSnapshotEntry( + snapshotEntry: Pick< 
+ AccuracySnapshotEntry, + | "requestedModel" + | "test" + | "prompt" + | "toolCallingAccuracy" + | "parameterAccuracy" + | "llmResponseTime" + | "tokensUsage" + | "respondingModel" + | "text" + | "messages" + > + ): Promise; + + getLastRunIdForCommit(commit: string): Promise; + + getSnapshotEntriesForRunId(accuracyRunId: string): Promise; + + close(): Promise; +} diff --git a/tests/accuracy/sdk/agent.ts b/tests/accuracy/sdk/agent.ts index 6997ffb6..4b5d2621 100644 --- a/tests/accuracy/sdk/agent.ts +++ b/tests/accuracy/sdk/agent.ts @@ -1,4 +1,4 @@ -import { generateText, Tool, Schema, LanguageModelV1 } from "ai"; +import { generateText, LanguageModelV1, experimental_createMCPClient } from "ai"; import { Model } from "./models.js"; const systemPrompt = [ @@ -10,15 +10,32 @@ const systemPrompt = [ 'If you do not know the answer or the request cannot be fulfilled, you MUST reply with "I don\'t know"', ]; -export interface Agent { - prompt(prompt: string, model: M, tools: T): Promise; +// Some necessary types from Vercel SDK +export type VercelMCPClient = Awaited>; +export type VercelMCPClientTools = Awaited>; +export type VercelAgent = ReturnType; + +// Generic interface for Agent, in case we need to switch to some other agent +// development SDK +export interface AgentPromptResult { + respondingModel: string; + tokensUsage?: { + promptTokens?: number; + completionTokens?: number; + totalTokens?: number; + }; + text: string; + messages: Record[]; +} +export interface Agent { + prompt(prompt: string, model: Model, tools: Tools): Promise; } export function getVercelToolCallingAgent( requestedSystemPrompt?: string -): Agent, Record>>, { text: string; messages: unknown[] }> { +): Agent, VercelMCPClientTools, AgentPromptResult> { return { - async prompt(prompt: string, model: Model, tools: Record>>) { + async prompt(prompt: string, model: Model, tools: VercelMCPClientTools) { const result = await generateText({ model: model.getModel(), system: [...systemPrompt, 
requestedSystemPrompt].join("\n"), @@ -29,6 +46,8 @@ export function getVercelToolCallingAgent( return { text: result.text, messages: result.response.messages, + respondingModel: result.response.modelId, + tokensUsage: result.usage, }; }, }; diff --git a/tests/accuracy/sdk/describe-accuracy-tests.ts b/tests/accuracy/sdk/describe-accuracy-tests.ts index 7a49b550..5670207a 100644 --- a/tests/accuracy/sdk/describe-accuracy-tests.ts +++ b/tests/accuracy/sdk/describe-accuracy-tests.ts @@ -1,8 +1,10 @@ import { TestableModels } from "./models.js"; import { ExpectedToolCall, parameterMatchingAccuracyScorer, toolCallingAccuracyScorer } from "./accuracy-scorers.js"; -import { Agent, getVercelToolCallingAgent } from "./agent.js"; +import { getVercelToolCallingAgent, VercelAgent } from "./agent.js"; import { prepareTestData, setupMongoDBIntegrationTest } from "../../integration/tools/mongodb/mongodbHelpers.js"; import { AccuracyTestingClient, MockedTools } from "./accuracy-testing-client.js"; +import { getAccuracySnapshotStorage } from "./accuracy-snapshot-storage/get-snapshot-storage.js"; +import { AccuracySnapshotStorage } from "./accuracy-snapshot-storage/snapshot-storage.js"; export interface AccuracyTestConfig { systemPrompt?: string; @@ -35,10 +37,12 @@ export function describeAccuracyTests( const mdbIntegration = setupMongoDBIntegrationTest(); const { populateTestData, cleanupTestDatabases } = prepareTestData(mdbIntegration); + let accuracySnapshotStorage: AccuracySnapshotStorage; let testMCPClient: AccuracyTestingClient; - let agent: Agent; + let agent: VercelAgent; beforeAll(async () => { + accuracySnapshotStorage = await getAccuracySnapshotStorage(); testMCPClient = await AccuracyTestingClient.initializeClient(mdbIntegration.connectionString()); agent = getVercelToolCallingAgent(); }); @@ -50,6 +54,7 @@ export function describeAccuracyTests( }); afterAll(async () => { + await accuracySnapshotStorage.close(); await testMCPClient.close(); }); @@ -62,21 +67,27 @@ 
export function describeAccuracyTests( const promptForModel = testConfig.injectConnectedAssumption ? [testConfig.prompt, "(Assume that you are already connected to a MongoDB cluster!)"].join(" ") : testConfig.prompt; - const conversation = await agent.prompt(promptForModel, model, toolsForModel); + + const timeBeforePrompt = Date.now(); + const result = await agent.prompt(promptForModel, model, toolsForModel); + const timeAfterPrompt = Date.now(); const toolCalls = testMCPClient.getToolCalls(); const toolCallingAccuracy = toolCallingAccuracyScorer(testConfig.expectedToolCalls, toolCalls); const parameterMatchingAccuracy = parameterMatchingAccuracyScorer( testConfig.expectedToolCalls, toolCalls ); - console.debug(testConfig.prompt); - // console.debug(`Conversation`, JSON.stringify(conversation, null, 2)); - // console.debug(`Tool calls`, JSON.stringify(toolCalls, null, 2)); - console.debug( - "Tool calling accuracy: %s, Parameter Accuracy: %s", + + const responseTime = timeAfterPrompt - timeBeforePrompt; + await accuracySnapshotStorage.createSnapshotEntry({ + requestedModel: model.modelName, + test: suiteName, + prompt: testConfig.prompt, + llmResponseTime: responseTime, toolCallingAccuracy, - parameterMatchingAccuracy - ); + parameterAccuracy: parameterMatchingAccuracy, + ...result, + }); }); }); }); diff --git a/tests/accuracy/sdk/git-info.ts b/tests/accuracy/sdk/git-info.ts new file mode 100644 index 00000000..03e34a7d --- /dev/null +++ b/tests/accuracy/sdk/git-info.ts @@ -0,0 +1,7 @@ +import { simpleGit } from "simple-git"; + +export async function getCommitSHA(): Promise { + const commitLogs = await simpleGit().log(); + const lastCommit = commitLogs.latest; + return lastCommit?.hash; +} From 8b39a1cea30b17064c5ef8656e03409a40585e24 Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Tue, 8 Jul 2025 11:06:49 +0200 Subject: [PATCH 25/91] chore: remove file based snapshot --- tests/accuracy/sdk/accuracy-snapshot.ts | 54 ------------------------- 1 file changed, 
54 deletions(-) delete mode 100644 tests/accuracy/sdk/accuracy-snapshot.ts diff --git a/tests/accuracy/sdk/accuracy-snapshot.ts b/tests/accuracy/sdk/accuracy-snapshot.ts deleted file mode 100644 index 1f7867a9..00000000 --- a/tests/accuracy/sdk/accuracy-snapshot.ts +++ /dev/null @@ -1,54 +0,0 @@ -import fs from "fs/promises"; -import path from "path"; -import { z } from "zod"; - -export const SNAPSHOT_FILE_PATH = path.resolve(process.cwd(), "accuracy-snapshot.json"); - -export const AccuracySnapshotEntrySchema = z.object({ - datetime: z.string(), - commit: z.string(), - model: z.string(), - suite: z.string(), - test: z.string(), - toolCallingAccuracy: z.number(), - parameterAccuracy: z.number(), -}); - -export type AccuracySnapshotEntry = z.infer; - -export async function readSnapshot(): Promise { - try { - const raw = await fs.readFile(SNAPSHOT_FILE_PATH, "utf8"); - return AccuracySnapshotEntrySchema.array().parse(JSON.parse(raw)); - } catch (e: unknown) { - if ((e as { code: string }).code === "ENOENT") { - return []; - } - throw e; - } -} - -function waitFor(ms: number) { - return new Promise((resolve) => setTimeout(resolve, ms)); -} - -export async function appendAccuracySnapshot(entry: AccuracySnapshotEntry): Promise { - AccuracySnapshotEntrySchema.parse(entry); - - for (let attempt = 0; attempt < 5; attempt++) { - try { - const snapshot = await readSnapshot(); - snapshot.unshift(entry); - const tmp = `${SNAPSHOT_FILE_PATH}~${Date.now()}`; - await fs.writeFile(tmp, JSON.stringify(snapshot, null, 2)); - await fs.rename(tmp, SNAPSHOT_FILE_PATH); - return; - } catch (e) { - if (attempt < 4) { - await waitFor(100 + Math.random() * 200); - } else { - throw e; - } - } - } -} From ca49d4090a85c11f1b039b59e2a0adeb487120a2 Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Tue, 8 Jul 2025 11:42:34 +0200 Subject: [PATCH 26/91] wip: snapshot summary generator --- .../accuracy-snapshot-storage/mdb-snapshot-storage.ts | 9 +++++++-- 
.../sdk/accuracy-snapshot-storage/snapshot-storage.ts | 4 +--- tests/accuracy/sdk/git-info.ts | 5 +++++ 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/tests/accuracy/sdk/accuracy-snapshot-storage/mdb-snapshot-storage.ts b/tests/accuracy/sdk/accuracy-snapshot-storage/mdb-snapshot-storage.ts index f8296a8a..c93abe12 100644 --- a/tests/accuracy/sdk/accuracy-snapshot-storage/mdb-snapshot-storage.ts +++ b/tests/accuracy/sdk/accuracy-snapshot-storage/mdb-snapshot-storage.ts @@ -49,7 +49,12 @@ export class MongoDBSnapshotStorage implements AccuracySnapshotStorage { await this.snapshotCollection.insertOne(snapshotWithMeta); } - async getLastRunIdForCommit(commit: string): Promise { + async getLatestSnapshotsForCommit(commit: string): Promise { + const latestRunId = await this.getLastRunIdForCommit(commit); + return latestRunId ? this.getSnapshotEntriesForRunId(latestRunId) : []; + } + + private async getLastRunIdForCommit(commit: string): Promise { const document = await this.snapshotCollection.findOne( { commit: commit }, { sort: { createdOn: -1 }, projection: { accuracyRunId: 1 } } @@ -58,7 +63,7 @@ export class MongoDBSnapshotStorage implements AccuracySnapshotStorage { return document?.accuracyRunId ? 
`${document?.accuracyRunId}` : undefined; } - async getSnapshotEntriesForRunId(accuracyRunId: string): Promise { + private async getSnapshotEntriesForRunId(accuracyRunId: string): Promise { const snapshotEntries = await this.snapshotCollection.find({ accuracyRunId }).toArray(); return AccuracySnapshotEntrySchema.array().parse(snapshotEntries); } diff --git a/tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.ts b/tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.ts index a6f92807..eb0e453f 100644 --- a/tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.ts +++ b/tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.ts @@ -43,9 +43,7 @@ export interface AccuracySnapshotStorage { > ): Promise; - getLastRunIdForCommit(commit: string): Promise; - - getSnapshotEntriesForRunId(accuracyRunId: string): Promise; + getLatestSnapshotsForCommit(commit: string): Promise; close(): Promise; } diff --git a/tests/accuracy/sdk/git-info.ts b/tests/accuracy/sdk/git-info.ts index 03e34a7d..a0918a6f 100644 --- a/tests/accuracy/sdk/git-info.ts +++ b/tests/accuracy/sdk/git-info.ts @@ -5,3 +5,8 @@ export async function getCommitSHA(): Promise { const lastCommit = commitLogs.latest; return lastCommit?.hash; } + +export async function getMergeBase(targetBranch: string, workBranchOrCommit: string): Promise { + const result = await simpleGit().raw(["merge-base", targetBranch, workBranchOrCommit]); + return result.trim(); +} From 92413df895829f8d7f5b860608cd27e96177ff10 Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Tue, 8 Jul 2025 13:02:17 +0200 Subject: [PATCH 27/91] chore: single entry point for running accuracy tests with different config --- package.json | 2 +- scripts/run-accuracy-tests.sh | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) create mode 100644 scripts/run-accuracy-tests.sh diff --git a/package.json b/package.json index ce6b5c03..205a2bac 100644 --- a/package.json +++ b/package.json @@ -30,7 +30,7 @@ "reformat": 
"prettier --write .", "generate": "./scripts/generate.sh", "test": "vitest --coverage", - "test:accuracy": "node --experimental-vm-modules node_modules/jest/bin/jest.js --testPathPattern tests/accuracy" + "test:accuracy": "sh ./scripts/run-accuracy-tests.sh" }, "license": "Apache-2.0", "devDependencies": { diff --git a/scripts/run-accuracy-tests.sh b/scripts/run-accuracy-tests.sh new file mode 100644 index 00000000..979f49e1 --- /dev/null +++ b/scripts/run-accuracy-tests.sh @@ -0,0 +1,7 @@ +#!/bin/sh +# Variables necessary for the accuracy test runs +export MDB_ACCURACY_RUN_ID=$(npx uuid v4) + +TEST_PATH_PATTERN="${1:-tests/accuracy}" +shift || true +node --experimental-vm-modules node_modules/jest/bin/jest.js --testPathPattern "$TEST_PATH_PATTERN" "$@" \ No newline at end of file From 8c50ecf5c06a63735d30b443951b729b6ce95f5f Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Tue, 8 Jul 2025 13:02:45 +0200 Subject: [PATCH 28/91] chore: reformat --- .../test-data-dumps/comics.books.json | 331 ++++-------------- .../test-data-dumps/comics.characters.json | 298 ++++------------ .../test-data-dumps/mflix.movies.json | 319 ++++------------- .../accuracy/test-data-dumps/mflix.shows.json | 296 ++++------------ 4 files changed, 255 insertions(+), 989 deletions(-) diff --git a/tests/accuracy/test-data-dumps/comics.books.json b/tests/accuracy/test-data-dumps/comics.books.json index 3bcb9ecc..f605f031 100644 --- a/tests/accuracy/test-data-dumps/comics.books.json +++ b/tests/accuracy/test-data-dumps/comics.books.json @@ -5,12 +5,8 @@ "publisher": "Dark Horse Comics", "release_date": "2007-03-02T00:00:00", "issues": 118, - "main_characters": [ - "Stephen Shaw" - ], - "genre": [ - "Sci-Fi" - ] + "main_characters": ["Stephen Shaw"], + "genre": ["Sci-Fi"] }, { "_id": "b2e993fb-2688-4ab0-9512-f8ada5faa948", @@ -18,13 +14,8 @@ "publisher": "Image Comics", "release_date": "1998-12-07T00:00:00", "issues": 137, - "main_characters": [ - "Margaret Hogan" - ], - "genre": [ - "Adventure", 
- "Horror" - ] + "main_characters": ["Margaret Hogan"], + "genre": ["Adventure", "Horror"] }, { "_id": "f674a05a-12c8-4344-875c-6cd1fcba8f9d", @@ -32,13 +23,8 @@ "publisher": "DC Comics", "release_date": "2012-12-01T00:00:00", "issues": 227, - "main_characters": [ - "Joseph Cook", - "Tammy Bishop" - ], - "genre": [ - "Superhero" - ] + "main_characters": ["Joseph Cook", "Tammy Bishop"], + "genre": ["Superhero"] }, { "_id": "bb72b493-2a61-41d7-9406-dfaf6e51a425", @@ -46,12 +32,8 @@ "publisher": "DC Comics", "release_date": "2011-02-24T00:00:00", "issues": 270, - "main_characters": [ - "Sandra Moss" - ], - "genre": [ - "Fantasy" - ] + "main_characters": ["Sandra Moss"], + "genre": ["Fantasy"] }, { "_id": "ea85131f-dfc8-4997-b3b0-996138185d73", @@ -65,10 +47,7 @@ "Tammy Murphy", "Larry Hensley" ], - "genre": [ - "Adventure", - "Horror" - ] + "genre": ["Adventure", "Horror"] }, { "_id": "fdd56270-eb31-4456-8bf4-df81371eb290", @@ -82,10 +61,7 @@ "Micheal Brown", "Jeremy Rice" ], - "genre": [ - "Fantasy", - "Action" - ] + "genre": ["Fantasy", "Action"] }, { "_id": "6de66ba4-3975-4055-824c-cda5caf517d2", @@ -93,15 +69,8 @@ "publisher": "Marvel Comics", "release_date": "2007-11-19T00:00:00", "issues": 55, - "main_characters": [ - "Joseph Bowman", - "Robert Logan", - "Ashley Watkins" - ], - "genre": [ - "Sci-Fi", - "Horror" - ] + "main_characters": ["Joseph Bowman", "Robert Logan", "Ashley Watkins"], + "genre": ["Sci-Fi", "Horror"] }, { "_id": "e3cafdbf-e97a-47c9-a848-bdd82e12f8f7", @@ -115,10 +84,7 @@ "Lindsay Anderson", "Scott Garcia" ], - "genre": [ - "Action", - "Horror" - ] + "genre": ["Action", "Horror"] }, { "_id": "547190cd-5c9e-44c5-b8f9-afeefd039001", @@ -126,12 +92,8 @@ "publisher": "Marvel Comics", "release_date": "1987-04-16T00:00:00", "issues": 235, - "main_characters": [ - "Julie Goodwin" - ], - "genre": [ - "Sci-Fi" - ] + "main_characters": ["Julie Goodwin"], + "genre": ["Sci-Fi"] }, { "_id": "ba3d82f7-8edc-408c-8212-c0d6634624ee", @@ -139,14 +101,8 @@ 
"publisher": "Dark Horse Comics", "release_date": "1979-09-13T00:00:00", "issues": 239, - "main_characters": [ - "Chad Pham", - "Lindsay Anderson", - "Carlos Burton" - ], - "genre": [ - "Adventure" - ] + "main_characters": ["Chad Pham", "Lindsay Anderson", "Carlos Burton"], + "genre": ["Adventure"] }, { "_id": "a6bc8677-22ab-415a-bfe2-731a9f887cb9", @@ -154,15 +110,8 @@ "publisher": "Marvel Comics", "release_date": "2023-10-01T00:00:00", "issues": 163, - "main_characters": [ - "Kevin Humphrey", - "Maria Wright", - "Virginia Watts" - ], - "genre": [ - "Fantasy", - "Action" - ] + "main_characters": ["Kevin Humphrey", "Maria Wright", "Virginia Watts"], + "genre": ["Fantasy", "Action"] }, { "_id": "fb986790-df22-4db4-8168-c76e9e9471f8", @@ -170,13 +119,8 @@ "publisher": "IDW Publishing", "release_date": "2016-09-28T00:00:00", "issues": 14, - "main_characters": [ - "Brian Vincent" - ], - "genre": [ - "Sci-Fi", - "Fantasy" - ] + "main_characters": ["Brian Vincent"], + "genre": ["Sci-Fi", "Fantasy"] }, { "_id": "700aa115-dc5a-4be6-b275-bfb943c95ee0", @@ -184,12 +128,8 @@ "publisher": "Image Comics", "release_date": "1970-04-16T00:00:00", "issues": 5, - "main_characters": [ - "Joseph Cook" - ], - "genre": [ - "Fantasy" - ] + "main_characters": ["Joseph Cook"], + "genre": ["Fantasy"] }, { "_id": "7959187e-9693-43a1-ae2d-c168431fceb2", @@ -197,15 +137,8 @@ "publisher": "IDW Publishing", "release_date": "2019-02-15T00:00:00", "issues": 121, - "main_characters": [ - "Angelica Stein", - "Benjamin Morris", - "Jeremy Rice" - ], - "genre": [ - "Fantasy", - "Action" - ] + "main_characters": ["Angelica Stein", "Benjamin Morris", "Jeremy Rice"], + "genre": ["Fantasy", "Action"] }, { "_id": "d6018445-5149-42e7-9d87-eb1b181ce20c", @@ -219,9 +152,7 @@ "Carlos Burton", "Micheal Brown" ], - "genre": [ - "Adventure" - ] + "genre": ["Adventure"] }, { "_id": "055507ff-7a48-4df8-9ba9-7b6c10e11836", @@ -235,10 +166,7 @@ "Holly Green", "James Sanchez" ], - "genre": [ - "Sci-Fi", - "Fantasy" - ] 
+ "genre": ["Sci-Fi", "Fantasy"] }, { "_id": "1add2da3-68e6-48a3-9703-b593c9e0bf2e", @@ -246,14 +174,8 @@ "publisher": "DC Comics", "release_date": "2001-03-01T00:00:00", "issues": 176, - "main_characters": [ - "Justin Martinez", - "Tammy Murphy" - ], - "genre": [ - "Action", - "Fantasy" - ] + "main_characters": ["Justin Martinez", "Tammy Murphy"], + "genre": ["Action", "Fantasy"] }, { "_id": "c0fe2869-eb7d-4f09-a773-028387a54969", @@ -261,14 +183,8 @@ "publisher": "DC Comics", "release_date": "1976-09-05T00:00:00", "issues": 68, - "main_characters": [ - "Christopher Elliott", - "Maria Wright" - ], - "genre": [ - "Superhero", - "Adventure" - ] + "main_characters": ["Christopher Elliott", "Maria Wright"], + "genre": ["Superhero", "Adventure"] }, { "_id": "c2fafbf6-5f71-4f31-9775-803e8c77e467", @@ -282,9 +198,7 @@ "Robert Logan", "Margaret Hogan" ], - "genre": [ - "Adventure" - ] + "genre": ["Adventure"] }, { "_id": "f72be3a7-d4be-40a1-ad66-370b44759047", @@ -292,15 +206,8 @@ "publisher": "Marvel Comics", "release_date": "1976-09-18T00:00:00", "issues": 275, - "main_characters": [ - "Sandra Moss", - "Charles Blair", - "Justin Martinez" - ], - "genre": [ - "Fantasy", - "Action" - ] + "main_characters": ["Sandra Moss", "Charles Blair", "Justin Martinez"], + "genre": ["Fantasy", "Action"] }, { "_id": "da5be16e-13e8-42d5-8954-bd89919395af", @@ -314,10 +221,7 @@ "Cristian Oneal", "Michelle Valdez" ], - "genre": [ - "Horror", - "Fantasy" - ] + "genre": ["Horror", "Fantasy"] }, { "_id": "92afc1e6-f703-4aa7-9866-3b62f2784fec", @@ -325,15 +229,8 @@ "publisher": "Image Comics", "release_date": "2008-07-21T00:00:00", "issues": 109, - "main_characters": [ - "Holly Green", - "Diana Mata", - "Julie Goodwin" - ], - "genre": [ - "Horror", - "Sci-Fi" - ] + "main_characters": ["Holly Green", "Diana Mata", "Julie Goodwin"], + "genre": ["Horror", "Sci-Fi"] }, { "_id": "fec61fdd-bddb-431a-b14a-d81601a47cf8", @@ -341,13 +238,8 @@ "publisher": "DC Comics", "release_date": 
"2012-04-27T00:00:00", "issues": 297, - "main_characters": [ - "Joshua Hicks" - ], - "genre": [ - "Action", - "Horror" - ] + "main_characters": ["Joshua Hicks"], + "genre": ["Action", "Horror"] }, { "_id": "9d37d0d7-1adc-4f54-8790-30f13472520c", @@ -355,15 +247,8 @@ "publisher": "Image Comics", "release_date": "1996-02-20T00:00:00", "issues": 295, - "main_characters": [ - "Margaret Hogan", - "Christopher Elliott", - "Joseph Cook" - ], - "genre": [ - "Fantasy", - "Adventure" - ] + "main_characters": ["Margaret Hogan", "Christopher Elliott", "Joseph Cook"], + "genre": ["Fantasy", "Adventure"] }, { "_id": "338a83ad-06fc-42e1-a605-60a192ce5643", @@ -377,9 +262,7 @@ "Julie Goodwin", "Charles Blair" ], - "genre": [ - "Action" - ] + "genre": ["Action"] }, { "_id": "5b07c17b-4df9-4b72-9c3e-b51d93def1fb", @@ -387,13 +270,8 @@ "publisher": "IDW Publishing", "release_date": "2024-06-19T00:00:00", "issues": 259, - "main_characters": [ - "Debbie Green" - ], - "genre": [ - "Sci-Fi", - "Superhero" - ] + "main_characters": ["Debbie Green"], + "genre": ["Sci-Fi", "Superhero"] }, { "_id": "625b11a5-bb45-4837-9cd6-50bfe2e3396c", @@ -407,9 +285,7 @@ "Brian Vincent", "Sandra Moss" ], - "genre": [ - "Adventure" - ] + "genre": ["Adventure"] }, { "_id": "71b845f3-4416-430a-81eb-8c208f824365", @@ -423,10 +299,7 @@ "Holly Green", "Joseph Bowman" ], - "genre": [ - "Superhero", - "Fantasy" - ] + "genre": ["Superhero", "Fantasy"] }, { "_id": "14dbf3a6-d258-4c96-8883-336b60bc2112", @@ -434,13 +307,8 @@ "publisher": "DC Comics", "release_date": "1969-11-30T00:00:00", "issues": 104, - "main_characters": [ - "Micheal Brown" - ], - "genre": [ - "Horror", - "Superhero" - ] + "main_characters": ["Micheal Brown"], + "genre": ["Horror", "Superhero"] }, { "_id": "091e16d8-d50c-4e7d-9b3a-545cf2596738", @@ -448,12 +316,8 @@ "publisher": "Image Comics", "release_date": "1990-01-24T00:00:00", "issues": 74, - "main_characters": [ - "Robert Logan" - ], - "genre": [ - "Sci-Fi" - ] + "main_characters": ["Robert 
Logan"], + "genre": ["Sci-Fi"] }, { "_id": "c47ec96a-4d6e-43ea-9bb5-00e4c8058b53", @@ -461,15 +325,8 @@ "publisher": "DC Comics", "release_date": "1971-04-21T00:00:00", "issues": 135, - "main_characters": [ - "Jeremy Rice", - "Elizabeth Robinson", - "James Sanchez" - ], - "genre": [ - "Action", - "Sci-Fi" - ] + "main_characters": ["Jeremy Rice", "Elizabeth Robinson", "James Sanchez"], + "genre": ["Action", "Sci-Fi"] }, { "_id": "d446a8ca-5d01-4be9-a061-027ef1f7bfc6", @@ -477,15 +334,8 @@ "publisher": "Dark Horse Comics", "release_date": "1984-06-24T00:00:00", "issues": 111, - "main_characters": [ - "Joshua Hicks", - "Jeremy Rice", - "Micheal Brown" - ], - "genre": [ - "Fantasy", - "Superhero" - ] + "main_characters": ["Joshua Hicks", "Jeremy Rice", "Micheal Brown"], + "genre": ["Fantasy", "Superhero"] }, { "_id": "09c734ff-2bf0-4cb6-bd42-4232209c00c9", @@ -493,14 +343,8 @@ "publisher": "DC Comics", "release_date": "2013-05-22T00:00:00", "issues": 13, - "main_characters": [ - "Luis Callahan", - "Tammy Bishop", - "Cynthia Brown" - ], - "genre": [ - "Action" - ] + "main_characters": ["Luis Callahan", "Tammy Bishop", "Cynthia Brown"], + "genre": ["Action"] }, { "_id": "691034fa-ad52-413e-96a2-a9a319fffe7b", @@ -508,12 +352,8 @@ "publisher": "DC Comics", "release_date": "2021-12-03T00:00:00", "issues": 129, - "main_characters": [ - "Margaret Hogan" - ], - "genre": [ - "Action" - ] + "main_characters": ["Margaret Hogan"], + "genre": ["Action"] }, { "_id": "07942b5a-f7c4-4fc1-bdeb-7eb46b0d57f8", @@ -521,13 +361,8 @@ "publisher": "Dark Horse Comics", "release_date": "2001-08-02T00:00:00", "issues": 38, - "main_characters": [ - "James Sanchez", - "Larry Hensley" - ], - "genre": [ - "Superhero" - ] + "main_characters": ["James Sanchez", "Larry Hensley"], + "genre": ["Superhero"] }, { "_id": "05d637ed-3942-4276-a885-7b3363dd48e2", @@ -535,13 +370,8 @@ "publisher": "Image Comics", "release_date": "2005-03-30T00:00:00", "issues": 150, - "main_characters": [ - "Carlos Burton" - 
], - "genre": [ - "Superhero", - "Fantasy" - ] + "main_characters": ["Carlos Burton"], + "genre": ["Superhero", "Fantasy"] }, { "_id": "88904f06-50a6-44f1-bccc-f379a9788611", @@ -549,13 +379,8 @@ "publisher": "Image Comics", "release_date": "2021-06-27T00:00:00", "issues": 262, - "main_characters": [ - "Luis Callahan" - ], - "genre": [ - "Sci-Fi", - "Superhero" - ] + "main_characters": ["Luis Callahan"], + "genre": ["Sci-Fi", "Superhero"] }, { "_id": "fc961fd6-2ec6-43e5-beae-7f58a6c25d9c", @@ -563,14 +388,8 @@ "publisher": "IDW Publishing", "release_date": "1969-06-03T00:00:00", "issues": 264, - "main_characters": [ - "Scott Garcia", - "Joseph Bowman" - ], - "genre": [ - "Fantasy", - "Superhero" - ] + "main_characters": ["Scott Garcia", "Joseph Bowman"], + "genre": ["Fantasy", "Superhero"] }, { "_id": "481a3ea6-9629-4fe6-8a5a-eba846f0e62c", @@ -584,10 +403,7 @@ "Benjamin Morris", "Virginia Watts" ], - "genre": [ - "Adventure", - "Action" - ] + "genre": ["Adventure", "Action"] }, { "_id": "6bab6bcd-2f6b-4dfb-a030-d63b32fc6250", @@ -595,14 +411,7 @@ "publisher": "IDW Publishing", "release_date": "2007-12-27T00:00:00", "issues": 117, - "main_characters": [ - "Debbie Green", - "Christopher Elliott", - "Joshua Hicks" - ], - "genre": [ - "Sci-Fi", - "Action" - ] + "main_characters": ["Debbie Green", "Christopher Elliott", "Joshua Hicks"], + "genre": ["Sci-Fi", "Action"] } -] \ No newline at end of file +] diff --git a/tests/accuracy/test-data-dumps/comics.characters.json b/tests/accuracy/test-data-dumps/comics.characters.json index 944c33d5..4a255f48 100644 --- a/tests/accuracy/test-data-dumps/comics.characters.json +++ b/tests/accuracy/test-data-dumps/comics.characters.json @@ -3,16 +3,9 @@ "_id": "d7047787-abea-40fa-b78e-939925fd3589", "name": "Elizabeth Robinson", "alias": "ashley62", - "powers": [ - "Shapeshifting", - "Telepathy", - "Flight" - ], + "powers": ["Shapeshifting", "Telepathy", "Flight"], "first_appearance": "1961-06-23T00:00:00", - "affiliations": [ - 
"Fantastic Four", - "X-Men" - ], + "affiliations": ["Fantastic Four", "X-Men"], "origin": "Earth", "is_villain": false }, @@ -20,14 +13,9 @@ "_id": "06ac8173-51a6-404c-8f9a-628de889b1de", "name": "Joshua Wang", "alias": "paulasmith", - "powers": [ - "Telekinesis" - ], + "powers": ["Telekinesis"], "first_appearance": "1987-04-16T00:00:00", - "affiliations": [ - "Fantastic Four", - "Justice League" - ], + "affiliations": ["Fantastic Four", "Justice League"], "origin": "Earth", "is_villain": true }, @@ -35,10 +23,7 @@ "_id": "252c203a-0271-4ee7-a3d9-34c9f922b959", "name": "Stephen Shaw", "alias": "adamskenneth", - "powers": [ - "Super Speed", - "Flight" - ], + "powers": ["Super Speed", "Flight"], "first_appearance": "2004-07-26T00:00:00", "affiliations": [], "origin": "Atlantis", @@ -48,14 +33,9 @@ "_id": "bf5b7d04-fe71-4969-84a3-0eb9ed5d2197", "name": "Joseph Bowman", "alias": "amysalazar", - "powers": [ - "Time Manipulation" - ], + "powers": ["Time Manipulation"], "first_appearance": "1961-07-03T00:00:00", - "affiliations": [ - "Teen Titans", - "Avengers" - ], + "affiliations": ["Teen Titans", "Avengers"], "origin": "Atlantis", "is_villain": true }, @@ -63,10 +43,7 @@ "_id": "c6271161-bd78-4338-b6ca-88d91f7b853e", "name": "Debbie Green", "alias": "steventodd", - "powers": [ - "Energy Blasts", - "Regeneration" - ], + "powers": ["Energy Blasts", "Regeneration"], "first_appearance": "2021-12-05T00:00:00", "affiliations": [], "origin": "Asgard", @@ -76,11 +53,7 @@ "_id": "60223f4c-5908-4f82-a2a3-a5dad1771f7f", "name": "Christopher Elliott", "alias": "barajasmitchell", - "powers": [ - "Flight", - "Invisibility", - "Telekinesis" - ], + "powers": ["Flight", "Invisibility", "Telekinesis"], "first_appearance": "1947-03-23T00:00:00", "affiliations": [], "origin": "Earth", @@ -90,10 +63,7 @@ "_id": "f66a8f7a-9ca3-431a-9ece-aba96be18220", "name": "Tammy Murphy", "alias": "jessicagill", - "powers": [ - "Super Strength", - "Telekinesis" - ], + "powers": ["Super Strength", 
"Telekinesis"], "first_appearance": "2000-07-06T00:00:00", "affiliations": [], "origin": "Mutant", @@ -103,10 +73,7 @@ "_id": "817c0b11-3eac-4a3a-b55f-203126db060f", "name": "Scott Garcia", "alias": "whitechristie", - "powers": [ - "Telepathy", - "Energy Blasts" - ], + "powers": ["Telepathy", "Energy Blasts"], "first_appearance": "2000-11-22T00:00:00", "affiliations": [], "origin": "Asgard", @@ -116,14 +83,9 @@ "_id": "1ee6789f-d774-43b8-87e2-9f6dbac6230a", "name": "Julie Goodwin", "alias": "robertsmith", - "powers": [ - "Telepathy", - "Super Speed" - ], + "powers": ["Telepathy", "Super Speed"], "first_appearance": "1953-08-09T00:00:00", - "affiliations": [ - "Teen Titans" - ], + "affiliations": ["Teen Titans"], "origin": "Mutant", "is_villain": true }, @@ -131,11 +93,7 @@ "_id": "3ab9b55d-94ab-449e-bda9-63b2c633494a", "name": "Joshua Hicks", "alias": "cynthia32", - "powers": [ - "Super Strength", - "Invisibility", - "Telekinesis" - ], + "powers": ["Super Strength", "Invisibility", "Telekinesis"], "first_appearance": "1967-07-17T00:00:00", "affiliations": [], "origin": "Krypton", @@ -145,14 +103,9 @@ "_id": "51adf385-1f8e-4290-bcc6-ce2808dc461e", "name": "Justin Martinez", "alias": "janicebrown", - "powers": [ - "Super Speed", - "Super Strength" - ], + "powers": ["Super Speed", "Super Strength"], "first_appearance": "1973-09-19T00:00:00", - "affiliations": [ - "Avengers" - ], + "affiliations": ["Avengers"], "origin": "Mutant", "is_villain": true }, @@ -160,10 +113,7 @@ "_id": "3a3d934e-f5bb-4238-b8a5-74669a937a14", "name": "Holly Green", "alias": "ystanley", - "powers": [ - "Shapeshifting", - "Energy Blasts" - ], + "powers": ["Shapeshifting", "Energy Blasts"], "first_appearance": "2013-08-05T00:00:00", "affiliations": [], "origin": "Krypton", @@ -173,15 +123,9 @@ "_id": "f044b9fb-82c6-48b3-b8b2-806b0be66466", "name": "Margaret Hogan", "alias": "wendyconway", - "powers": [ - "Super Speed", - "Telepathy" - ], + "powers": ["Super Speed", "Telepathy"], 
"first_appearance": "1944-08-13T00:00:00", - "affiliations": [ - "Justice League", - "X-Men" - ], + "affiliations": ["Justice League", "X-Men"], "origin": "Earth", "is_villain": false }, @@ -189,14 +133,9 @@ "_id": "fd50880a-9d0e-43e1-8b20-2830eba8c7dc", "name": "Ashley Watkins", "alias": "cjohnson", - "powers": [ - "Shapeshifting" - ], + "powers": ["Shapeshifting"], "first_appearance": "1940-09-13T00:00:00", - "affiliations": [ - "Fantastic Four", - "Guardians of the Galaxy" - ], + "affiliations": ["Fantastic Four", "Guardians of the Galaxy"], "origin": "Mutant", "is_villain": true }, @@ -204,14 +143,9 @@ "_id": "68036d6b-1780-4352-98ea-2c68cb5c7bff", "name": "Tammy Bishop", "alias": "geoffreyryan", - "powers": [ - "Regeneration" - ], + "powers": ["Regeneration"], "first_appearance": "1984-11-04T00:00:00", - "affiliations": [ - "Fantastic Four", - "X-Men" - ], + "affiliations": ["Fantastic Four", "X-Men"], "origin": "Earth", "is_villain": true }, @@ -219,14 +153,9 @@ "_id": "dbfa84f2-e598-4e67-99a9-5e8c34e5606f", "name": "Michelle Valdez", "alias": "manuelcobb", - "powers": [ - "Regeneration", - "Energy Blasts" - ], + "powers": ["Regeneration", "Energy Blasts"], "first_appearance": "2014-08-04T00:00:00", - "affiliations": [ - "Teen Titans" - ], + "affiliations": ["Teen Titans"], "origin": "Mutant", "is_villain": false }, @@ -234,10 +163,7 @@ "_id": "ae85885c-13d0-4ae2-b82c-fa53859665d7", "name": "Joseph Cook", "alias": "scott40", - "powers": [ - "Telepathy", - "Telekinesis" - ], + "powers": ["Telepathy", "Telekinesis"], "first_appearance": "1976-04-01T00:00:00", "affiliations": [], "origin": "Earth", @@ -247,9 +173,7 @@ "_id": "0738b98f-4699-4609-9156-fb6a1085a503", "name": "Jeremy Rice", "alias": "james82", - "powers": [ - "Invisibility" - ], + "powers": ["Invisibility"], "first_appearance": "1977-09-22T00:00:00", "affiliations": [], "origin": "Asgard", @@ -259,13 +183,9 @@ "_id": "a072c5df-cc65-4044-ba24-fcc8eaa71b4a", "name": "Chad Pham", "alias": 
"smithjennifer", - "powers": [ - "Telepathy" - ], + "powers": ["Telepathy"], "first_appearance": "2001-05-26T00:00:00", - "affiliations": [ - "Teen Titans" - ], + "affiliations": ["Teen Titans"], "origin": "Mars", "is_villain": false }, @@ -273,11 +193,7 @@ "_id": "d545ec48-680c-4493-8650-d759bedabb7e", "name": "Diana Mata", "alias": "zwilliamson", - "powers": [ - "Super Speed", - "Energy Blasts", - "Invisibility" - ], + "powers": ["Super Speed", "Energy Blasts", "Invisibility"], "first_appearance": "2010-11-21T00:00:00", "affiliations": [], "origin": "Mars", @@ -287,15 +203,9 @@ "_id": "e6bfb576-d65c-40f8-a547-90719578e03c", "name": "Maria Wright", "alias": "yraymond", - "powers": [ - "Flight", - "Telepathy" - ], + "powers": ["Flight", "Telepathy"], "first_appearance": "1971-04-15T00:00:00", - "affiliations": [ - "Avengers", - "Teen Titans" - ], + "affiliations": ["Avengers", "Teen Titans"], "origin": "Asgard", "is_villain": true }, @@ -303,15 +213,9 @@ "_id": "a2e7b056-0c79-4a2e-83ff-1774b6e186ea", "name": "Carlos Burton", "alias": "rperkins", - "powers": [ - "Super Speed", - "Time Manipulation", - "Telekinesis" - ], + "powers": ["Super Speed", "Time Manipulation", "Telekinesis"], "first_appearance": "1970-01-20T00:00:00", - "affiliations": [ - "Teen Titans" - ], + "affiliations": ["Teen Titans"], "origin": "Mutant", "is_villain": true }, @@ -319,10 +223,7 @@ "_id": "ec7f8d60-3fef-4329-a7d2-6d89805d758c", "name": "Lindsay Anderson", "alias": "amycox", - "powers": [ - "Super Strength", - "Telekinesis" - ], + "powers": ["Super Strength", "Telekinesis"], "first_appearance": "1976-04-30T00:00:00", "affiliations": [], "origin": "Atlantis", @@ -332,16 +233,9 @@ "_id": "cdc66356-a438-4989-b4d1-315609ec6d91", "name": "Larry Hensley", "alias": "ylester", - "powers": [ - "Super Strength", - "Invisibility", - "Shapeshifting" - ], + "powers": ["Super Strength", "Invisibility", "Shapeshifting"], "first_appearance": "2019-01-21T00:00:00", - "affiliations": [ - "Guardians of 
the Galaxy", - "Avengers" - ], + "affiliations": ["Guardians of the Galaxy", "Avengers"], "origin": "Asgard", "is_villain": false }, @@ -349,10 +243,7 @@ "_id": "0952b684-f887-446f-afcb-71d2ace3fd32", "name": "Sandra Moss", "alias": "alexandra81", - "powers": [ - "Telekinesis", - "Super Speed" - ], + "powers": ["Telekinesis", "Super Speed"], "first_appearance": "1989-07-28T00:00:00", "affiliations": [], "origin": "Earth", @@ -362,14 +253,9 @@ "_id": "9a63c787-3b44-46c2-b927-ffdde6ee10bc", "name": "Cynthia Brown", "alias": "freed", - "powers": [ - "Super Strength", - "Energy Blasts" - ], + "powers": ["Super Strength", "Energy Blasts"], "first_appearance": "2015-06-19T00:00:00", - "affiliations": [ - "Fantastic Four" - ], + "affiliations": ["Fantastic Four"], "origin": "Mars", "is_villain": false }, @@ -377,11 +263,7 @@ "_id": "2b058c3e-e795-4ecd-b5d7-dba6f1a831f6", "name": "Brian Vincent", "alias": "ghowell", - "powers": [ - "Invisibility", - "Flight", - "Super Speed" - ], + "powers": ["Invisibility", "Flight", "Super Speed"], "first_appearance": "2012-05-12T00:00:00", "affiliations": [], "origin": "Asgard", @@ -391,16 +273,9 @@ "_id": "7a1e38ae-0bc6-41dd-ad61-e7542e6e9d4f", "name": "Kevin Humphrey", "alias": "mary44", - "powers": [ - "Super Strength", - "Super Speed", - "Telepathy" - ], + "powers": ["Super Strength", "Super Speed", "Telepathy"], "first_appearance": "1993-05-10T00:00:00", - "affiliations": [ - "Justice League", - "Teen Titans" - ], + "affiliations": ["Justice League", "Teen Titans"], "origin": "Mutant", "is_villain": true }, @@ -408,13 +283,9 @@ "_id": "c147036a-ab66-4023-a950-1fb81acf7dca", "name": "Luis Callahan", "alias": "ashleyreeves", - "powers": [ - "Telekinesis" - ], + "powers": ["Telekinesis"], "first_appearance": "1943-11-02T00:00:00", - "affiliations": [ - "X-Men" - ], + "affiliations": ["X-Men"], "origin": "Krypton", "is_villain": false }, @@ -422,11 +293,7 @@ "_id": "c42cec2b-156d-481e-993b-aa93637ae76e", "name": "Micheal Brown", 
"alias": "lisa85", - "powers": [ - "Telepathy", - "Flight", - "Time Manipulation" - ], + "powers": ["Telepathy", "Flight", "Time Manipulation"], "first_appearance": "1983-11-04T00:00:00", "affiliations": [], "origin": "Krypton", @@ -436,14 +303,9 @@ "_id": "5bd85192-926b-42f3-bc18-afd40a53753e", "name": "James Sanchez", "alias": "mary95", - "powers": [ - "Energy Blasts", - "Telekinesis" - ], + "powers": ["Energy Blasts", "Telekinesis"], "first_appearance": "1999-05-20T00:00:00", - "affiliations": [ - "Justice League" - ], + "affiliations": ["Justice League"], "origin": "Atlantis", "is_villain": false }, @@ -451,16 +313,9 @@ "_id": "4b41e8f8-2cea-4d50-b7b0-ec59fca45367", "name": "Richard Cooper", "alias": "james85", - "powers": [ - "Telekinesis", - "Energy Blasts", - "Super Speed" - ], + "powers": ["Telekinesis", "Energy Blasts", "Super Speed"], "first_appearance": "2021-11-27T00:00:00", - "affiliations": [ - "Justice League", - "Fantastic Four" - ], + "affiliations": ["Justice League", "Fantastic Four"], "origin": "Mars", "is_villain": true }, @@ -468,9 +323,7 @@ "_id": "8fd8c7b5-fabd-4021-9aeb-114e64ad06e0", "name": "Charles Blair", "alias": "barbara60", - "powers": [ - "Super Strength" - ], + "powers": ["Super Strength"], "first_appearance": "2012-05-03T00:00:00", "affiliations": [], "origin": "Krypton", @@ -480,9 +333,7 @@ "_id": "830eaa54-4397-4344-8964-2abdd7e2d86d", "name": "Virginia Watts", "alias": "klane", - "powers": [ - "Telekinesis" - ], + "powers": ["Telekinesis"], "first_appearance": "2016-04-27T00:00:00", "affiliations": [], "origin": "Earth", @@ -492,9 +343,7 @@ "_id": "495f64a9-123e-46d4-9ddb-21692353a849", "name": "Robert Logan", "alias": "griffinsean", - "powers": [ - "Telepathy" - ], + "powers": ["Telepathy"], "first_appearance": "2003-07-16T00:00:00", "affiliations": [], "origin": "Krypton", @@ -504,10 +353,7 @@ "_id": "e3a96aac-bd9f-49f0-a9ea-efa7d6baf3e9", "name": "Cheyenne Powell", "alias": "laurenolsen", - "powers": [ - "Time Manipulation", 
- "Energy Blasts" - ], + "powers": ["Time Manipulation", "Energy Blasts"], "first_appearance": "1964-02-05T00:00:00", "affiliations": [], "origin": "Atlantis", @@ -517,16 +363,9 @@ "_id": "2688321c-f5b0-43c8-b95c-060e748ba73b", "name": "Benjamin Morris", "alias": "sierra18", - "powers": [ - "Telekinesis", - "Regeneration", - "Shapeshifting" - ], + "powers": ["Telekinesis", "Regeneration", "Shapeshifting"], "first_appearance": "1964-09-27T00:00:00", - "affiliations": [ - "X-Men", - "Avengers" - ], + "affiliations": ["X-Men", "Avengers"], "origin": "Mars", "is_villain": false }, @@ -534,9 +373,7 @@ "_id": "98c4ca66-c7a7-44ad-ad16-5395905a011e", "name": "Cristian Oneal", "alias": "harrellamy", - "powers": [ - "Super Speed" - ], + "powers": ["Super Speed"], "first_appearance": "1965-01-29T00:00:00", "affiliations": [], "origin": "Mutant", @@ -546,16 +383,9 @@ "_id": "e2999d26-1a93-4355-b04f-44f27a3c7f36", "name": "Jessica Vargas", "alias": "chadherrera", - "powers": [ - "Energy Blasts", - "Super Strength", - "Telekinesis" - ], + "powers": ["Energy Blasts", "Super Strength", "Telekinesis"], "first_appearance": "1974-03-29T00:00:00", - "affiliations": [ - "X-Men", - "Teen Titans" - ], + "affiliations": ["X-Men", "Teen Titans"], "origin": "Earth", "is_villain": true }, @@ -563,14 +393,10 @@ "_id": "f3fa712d-2124-433a-b405-c02757fa1503", "name": "Angelica Stein", "alias": "reedjason", - "powers": [ - "Invisibility" - ], + "powers": ["Invisibility"], "first_appearance": "1981-01-02T00:00:00", - "affiliations": [ - "Avengers" - ], + "affiliations": ["Avengers"], "origin": "Earth", "is_villain": true } -] \ No newline at end of file +] diff --git a/tests/accuracy/test-data-dumps/mflix.movies.json b/tests/accuracy/test-data-dumps/mflix.movies.json index cd35382e..3c492185 100644 --- a/tests/accuracy/test-data-dumps/mflix.movies.json +++ b/tests/accuracy/test-data-dumps/mflix.movies.json @@ -3,16 +3,9 @@ "_id": "bf96c9f7-17be-467c-9f5e-3f19dc2e9ed4", "title": "Human sell", 
"release_year": 1993, - "genres": [ - "Sci-Fi" - ], + "genres": ["Sci-Fi"], "director": "Christina Collins", - "cast": [ - "Jeremy Marks", - "Matthew Moore", - "Erica Miller", - "Beth Morales" - ], + "cast": ["Jeremy Marks", "Matthew Moore", "Erica Miller", "Beth Morales"], "runtime": 139, "rating": 9.3 }, @@ -20,10 +13,7 @@ "_id": "ab338dcb-c541-4d39-ba3d-58e4ebcac16c", "title": "Trial we much", "release_year": 2020, - "genres": [ - "Horror", - "Comedy" - ], + "genres": ["Horror", "Comedy"], "director": "Steven Miles", "cast": [ "Patrick Huynh", @@ -38,10 +28,7 @@ "_id": "2bd3ed9f-cbeb-4c44-bec7-01d51c3dd7db", "title": "Someone", "release_year": 1996, - "genres": [ - "Action", - "Horror" - ], + "genres": ["Action", "Horror"], "director": "Steven Miles", "cast": [ "Carrie Cummings", @@ -57,9 +44,7 @@ "_id": "fb35d6f3-bda5-450f-8873-56e035e76c42", "title": "Without our", "release_year": 2012, - "genres": [ - "Comedy" - ], + "genres": ["Comedy"], "director": "Christina Collins", "cast": [ "Rodney Gray", @@ -75,16 +60,9 @@ "_id": "4b0d5f7a-c551-4995-aece-a5a585d238a7", "title": "Cost anything", "release_year": 2002, - "genres": [ - "Romance", - "Action" - ], + "genres": ["Romance", "Action"], "director": "Bryan Andrews", - "cast": [ - "Gregory Mullins", - "Jillian Arroyo", - "Angela Reed" - ], + "cast": ["Gregory Mullins", "Jillian Arroyo", "Angela Reed"], "runtime": 112, "rating": 3.8 }, @@ -92,9 +70,7 @@ "_id": "797e4ee5-eff4-45f4-a0d7-40f62f7bd138", "title": "Hold green energy their", "release_year": 1989, - "genres": [ - "Horror" - ], + "genres": ["Horror"], "director": "Christina Collins", "cast": [ "Eduardo Carey", @@ -109,10 +85,7 @@ "_id": "1b81c45b-1d09-47dc-871f-ace109107446", "title": "Choose ability start", "release_year": 1990, - "genres": [ - "Drama", - "Comedy" - ], + "genres": ["Drama", "Comedy"], "director": "Bryan Andrews", "cast": [ "Tyler Daniels", @@ -127,15 +100,9 @@ "_id": "400a08be-f07b-416a-8cdc-46c9886b812b", "title": "Cover perhaps", 
"release_year": 2022, - "genres": [ - "Drama" - ], + "genres": ["Drama"], "director": "Daniel Wallace", - "cast": [ - "Victoria Price", - "Holly Ross", - "Michele Jones" - ], + "cast": ["Victoria Price", "Holly Ross", "Michele Jones"], "runtime": 173, "rating": 4.3 }, @@ -143,15 +110,9 @@ "_id": "4d4b5420-83e1-4ecd-9c86-238394a1fd0f", "title": "Policy particularly", "release_year": 2003, - "genres": [ - "Comedy" - ], + "genres": ["Comedy"], "director": "Brittany Parker", - "cast": [ - "Emily Haynes", - "Crystal Johnson", - "Ernest Jones" - ], + "cast": ["Emily Haynes", "Crystal Johnson", "Ernest Jones"], "runtime": 154, "rating": 6.6 }, @@ -159,10 +120,7 @@ "_id": "9a489559-ab9d-4dbb-b3e7-d65895b27704", "title": "Store care", "release_year": 2017, - "genres": [ - "Romance", - "Sci-Fi" - ], + "genres": ["Romance", "Sci-Fi"], "director": "Sara Stewart", "cast": [ "Katherine Matthews", @@ -178,10 +136,7 @@ "_id": "99e75e60-6466-4314-92c3-00c433a06600", "title": "Section close bad", "release_year": 2024, - "genres": [ - "Drama", - "Comedy" - ], + "genres": ["Drama", "Comedy"], "director": "Bryan Andrews", "cast": [ "Heather Marshall", @@ -196,16 +151,9 @@ "_id": "726d0c12-4bab-4684-b8e4-5ba795c88273", "title": "Become stand", "release_year": 2001, - "genres": [ - "Sci-Fi", - "Thriller" - ], + "genres": ["Sci-Fi", "Thriller"], "director": "Brian Martinez", - "cast": [ - "Robert Ross", - "Kimberly Williamson", - "Pam Wyatt" - ], + "cast": ["Robert Ross", "Kimberly Williamson", "Pam Wyatt"], "runtime": 162, "rating": 1.5 }, @@ -213,10 +161,7 @@ "_id": "aad23b4b-ddb9-48bd-9b48-b63da1874bb0", "title": "I case", "release_year": 2012, - "genres": [ - "Drama", - "Comedy" - ], + "genres": ["Drama", "Comedy"], "director": "Brittany Parker", "cast": [ "Justin Davis", @@ -231,15 +176,9 @@ "_id": "0d1ce099-18f1-4608-9c5b-5eb8b5870760", "title": "No organization style", "release_year": 2013, - "genres": [ - "Comedy" - ], + "genres": ["Comedy"], "director": "Christina Collins", - 
"cast": [ - "Benjamin Whitney", - "Joseph Bush", - "Barbara Griffin" - ], + "cast": ["Benjamin Whitney", "Joseph Bush", "Barbara Griffin"], "runtime": 167, "rating": 9.6 }, @@ -247,15 +186,9 @@ "_id": "15855c7b-ece2-4238-b995-57f6207509ea", "title": "Computer garden", "release_year": 2012, - "genres": [ - "Horror" - ], + "genres": ["Horror"], "director": "Steven Miles", - "cast": [ - "Darlene Lee", - "Tina Wang", - "Nathan Mayo" - ], + "cast": ["Darlene Lee", "Tina Wang", "Nathan Mayo"], "runtime": 146, "rating": 6.5 }, @@ -263,15 +196,9 @@ "_id": "e8a6ff98-1e7e-4481-a467-39ebbfc79f67", "title": "Trip information feel", "release_year": 2008, - "genres": [ - "Action", - "Thriller" - ], + "genres": ["Action", "Thriller"], "director": "Brittany Parker", - "cast": [ - "Kelly Walsh", - "Michael Rocha" - ], + "cast": ["Kelly Walsh", "Michael Rocha"], "runtime": 148, "rating": 9.8 }, @@ -279,9 +206,7 @@ "_id": "ef95e7a5-7f73-462e-bd03-c924a8876a7b", "title": "It project low part", "release_year": 1992, - "genres": [ - "Horror" - ], + "genres": ["Horror"], "director": "Christina Collins", "cast": [ "Sheena Murphy", @@ -297,9 +222,7 @@ "_id": "efd2f4f4-1004-4b4e-8bc9-390466a6f77a", "title": "Near attorney discuss", "release_year": 1983, - "genres": [ - "Comedy" - ], + "genres": ["Comedy"], "director": "Christina Collins", "cast": [ "Chase Myers", @@ -314,16 +237,9 @@ "_id": "07f2cb6e-819e-4ff4-b3ba-134d3d9af549", "title": "Whether know", "release_year": 2009, - "genres": [ - "Comedy", - "Thriller" - ], + "genres": ["Comedy", "Thriller"], "director": "Bryan Andrews", - "cast": [ - "Amy Reed", - "William Williams", - "Steven Lawrence" - ], + "cast": ["Amy Reed", "William Williams", "Steven Lawrence"], "runtime": 134, "rating": 9.6 }, @@ -331,10 +247,7 @@ "_id": "ab5948c9-088b-42d6-89d9-42c4603c8b19", "title": "Against place", "release_year": 2017, - "genres": [ - "Drama", - "Romance" - ], + "genres": ["Drama", "Romance"], "director": "Daniel Wallace", "cast": [ "Brittany 
Thompson", @@ -350,16 +263,9 @@ "_id": "ef7f63fa-b25f-4aea-98e2-d7bdecc26ef5", "title": "Return yard", "release_year": 1994, - "genres": [ - "Horror" - ], + "genres": ["Horror"], "director": "Christina Collins", - "cast": [ - "Mason Lara", - "Taylor Salinas", - "Tim Foster", - "Erin Sharp" - ], + "cast": ["Mason Lara", "Taylor Salinas", "Tim Foster", "Erin Sharp"], "runtime": 99, "rating": 8.8 }, @@ -367,9 +273,7 @@ "_id": "b532e3c8-6292-4f9d-879f-1f070b1a6992", "title": "Certain fish", "release_year": 2009, - "genres": [ - "Romance" - ], + "genres": ["Romance"], "director": "Steven Miles", "cast": [ "Jonathan King", @@ -384,9 +288,7 @@ "_id": "c95e74b0-e47e-4d10-b847-8caa20b94b32", "title": "Agreement like program", "release_year": 2004, - "genres": [ - "Sci-Fi" - ], + "genres": ["Sci-Fi"], "director": "Daniel Jackson", "cast": [ "Ashley Green", @@ -402,14 +304,9 @@ "_id": "791688be-4358-45ab-956e-71fe3fd35d19", "title": "Floor seven then", "release_year": 2009, - "genres": [ - "Horror" - ], + "genres": ["Horror"], "director": "Daniel Wallace", - "cast": [ - "Dustin Wright", - "Crystal Young" - ], + "cast": ["Dustin Wright", "Crystal Young"], "runtime": 143, "rating": 4.8 }, @@ -417,16 +314,9 @@ "_id": "488fd79d-dde6-4462-9b90-339d1f3d7474", "title": "Like rather paper", "release_year": 2006, - "genres": [ - "Drama" - ], + "genres": ["Drama"], "director": "Spencer Gillespie", - "cast": [ - "Sean Moyer", - "James Edwards", - "Tara Lee", - "Robert Scott" - ], + "cast": ["Sean Moyer", "James Edwards", "Tara Lee", "Robert Scott"], "runtime": 175, "rating": 9.1 }, @@ -434,10 +324,7 @@ "_id": "3da68e4d-ef14-4fab-9243-19075262e5ca", "title": "Argue hospital", "release_year": 1994, - "genres": [ - "Romance", - "Sci-Fi" - ], + "genres": ["Romance", "Sci-Fi"], "director": "Amanda Young", "cast": [ "Carolyn Williams", @@ -453,15 +340,9 @@ "_id": "f5206a16-4dca-4c1e-b3aa-0d09f2082601", "title": "Become after card", "release_year": 1986, - "genres": [ - "Sci-Fi", - "Horror" - 
], + "genres": ["Sci-Fi", "Horror"], "director": "Brian Martinez", - "cast": [ - "Rhonda Ochoa", - "Charlene Castillo" - ], + "cast": ["Rhonda Ochoa", "Charlene Castillo"], "runtime": 100, "rating": 8.5 }, @@ -469,14 +350,9 @@ "_id": "fbf30e42-ae6d-4775-bb3e-c5c127ddea06", "title": "Born authority attention", "release_year": 1994, - "genres": [ - "Romance" - ], + "genres": ["Romance"], "director": "Brian Martinez", - "cast": [ - "Matthew Thomas", - "Carly Perkins" - ], + "cast": ["Matthew Thomas", "Carly Perkins"], "runtime": 131, "rating": 4.9 }, @@ -484,15 +360,9 @@ "_id": "4b85a220-8a09-46a7-bea3-a2dad8130311", "title": "Local seven media", "release_year": 1998, - "genres": [ - "Sci-Fi", - "Drama" - ], + "genres": ["Sci-Fi", "Drama"], "director": "Amanda Young", - "cast": [ - "Jessica Perez", - "Larry Atkinson" - ], + "cast": ["Jessica Perez", "Larry Atkinson"], "runtime": 95, "rating": 2.0 }, @@ -500,14 +370,9 @@ "_id": "498597d2-3254-46ef-a800-f322a86fbd55", "title": "Keep employee", "release_year": 1981, - "genres": [ - "Horror" - ], + "genres": ["Horror"], "director": "Christina Collins", - "cast": [ - "Alexis Carlson", - "Andrew Stewart" - ], + "cast": ["Alexis Carlson", "Andrew Stewart"], "runtime": 161, "rating": 6.0 }, @@ -515,15 +380,9 @@ "_id": "788d9343-6908-4762-88ee-b04aba1e58b5", "title": "American question generation", "release_year": 1986, - "genres": [ - "Romance" - ], + "genres": ["Romance"], "director": "Daniel Jackson", - "cast": [ - "Troy Carter", - "Peter Hernandez", - "Christine Brown" - ], + "cast": ["Troy Carter", "Peter Hernandez", "Christine Brown"], "runtime": 176, "rating": 8.0 }, @@ -531,16 +390,9 @@ "_id": "74bcf255-df91-40c0-85c0-d7b85ff84f9a", "title": "Maintain out", "release_year": 2000, - "genres": [ - "Sci-Fi", - "Action" - ], + "genres": ["Sci-Fi", "Action"], "director": "Brian Martinez", - "cast": [ - "Nancy Evans", - "Michael Gill", - "Justin Carroll" - ], + "cast": ["Nancy Evans", "Michael Gill", "Justin Carroll"], 
"runtime": 179, "rating": 10.0 }, @@ -548,10 +400,7 @@ "_id": "61ddf1d4-17b7-4c63-9bf4-5315e740dc7f", "title": "Ten box study", "release_year": 2011, - "genres": [ - "Horror", - "Romance" - ], + "genres": ["Horror", "Romance"], "director": "Steven Miles", "cast": [ "Mark Hicks", @@ -566,17 +415,9 @@ "_id": "ab7d8067-f0ff-4955-bc0c-baca4e56e9a4", "title": "Production operation", "release_year": 2014, - "genres": [ - "Horror", - "Romance" - ], + "genres": ["Horror", "Romance"], "director": "Sara Stewart", - "cast": [ - "Ashley Mata", - "Mark Kelly", - "John West", - "Harold Day" - ], + "cast": ["Ashley Mata", "Mark Kelly", "John West", "Harold Day"], "runtime": 125, "rating": 4.1 }, @@ -584,9 +425,7 @@ "_id": "ccd27288-a496-447d-b01c-1f0b42edcc92", "title": "What language", "release_year": 2004, - "genres": [ - "Sci-Fi" - ], + "genres": ["Sci-Fi"], "director": "Sara Stewart", "cast": [ "Scott Mckenzie", @@ -602,16 +441,9 @@ "_id": "b32dd176-938b-4ded-823a-311423fdc2ea", "title": "Up usually central", "release_year": 2011, - "genres": [ - "Sci-Fi", - "Comedy" - ], + "genres": ["Sci-Fi", "Comedy"], "director": "Daniel Jackson", - "cast": [ - "Jennifer Carlson", - "Jonathan Stewart DDS", - "Amy Lester" - ], + "cast": ["Jennifer Carlson", "Jonathan Stewart DDS", "Amy Lester"], "runtime": 159, "rating": 5.6 }, @@ -619,17 +451,9 @@ "_id": "4aa5f384-3a05-49ff-aa9d-a0e4256c422f", "title": "For boy only", "release_year": 1987, - "genres": [ - "Thriller", - "Action" - ], + "genres": ["Thriller", "Action"], "director": "Sara Stewart", - "cast": [ - "Gene Smith", - "Robert Osborne Jr.", - "Laura Fox", - "Alexis Lowe" - ], + "cast": ["Gene Smith", "Robert Osborne Jr.", "Laura Fox", "Alexis Lowe"], "runtime": 95, "rating": 3.6 }, @@ -637,9 +461,7 @@ "_id": "1c858ca4-d6e9-435c-8e25-d8b05a4e825c", "title": "Site win including your", "release_year": 2008, - "genres": [ - "Sci-Fi" - ], + "genres": ["Sci-Fi"], "director": "Spencer Gillespie", "cast": [ "John Williams", @@ -655,15 
+477,9 @@ "_id": "bc5e5766-e998-4ec2-a40c-62ce5d39b972", "title": "Sell huge hair", "release_year": 1997, - "genres": [ - "Thriller", - "Action" - ], + "genres": ["Thriller", "Action"], "director": "Bryan Andrews", - "cast": [ - "Thomas Johnson", - "Ryan Morrow" - ], + "cast": ["Thomas Johnson", "Ryan Morrow"], "runtime": 157, "rating": 4.4 }, @@ -671,17 +487,10 @@ "_id": "090215c8-29e8-4d38-ae9b-ceb78408b982", "title": "Guy rest", "release_year": 1997, - "genres": [ - "Sci-Fi", - "Horror" - ], + "genres": ["Sci-Fi", "Horror"], "director": "Steven Miles", - "cast": [ - "Michael Fox", - "Tyler Acosta", - "Tracy Adams" - ], + "cast": ["Michael Fox", "Tyler Acosta", "Tracy Adams"], "runtime": 122, "rating": 7.8 } -] \ No newline at end of file +] diff --git a/tests/accuracy/test-data-dumps/mflix.shows.json b/tests/accuracy/test-data-dumps/mflix.shows.json index e91c26bb..2edc7fa7 100644 --- a/tests/accuracy/test-data-dumps/mflix.shows.json +++ b/tests/accuracy/test-data-dumps/mflix.shows.json @@ -5,9 +5,7 @@ "seasons": 8, "episodes": 62, "platform": "Amazon Prime", - "genres": [ - "Comedy" - ], + "genres": ["Comedy"], "cast": [ "Roger Gomez", "Sandra Williams", @@ -25,14 +23,8 @@ "seasons": 4, "episodes": 108, "platform": "Hulu", - "genres": [ - "Thriller" - ], - "cast": [ - "Joseph Holmes", - "Patrick Smith", - "Charles Delacruz" - ], + "genres": ["Thriller"], + "cast": ["Joseph Holmes", "Patrick Smith", "Charles Delacruz"], "start_year": 2001, "end_year": null }, @@ -42,10 +34,7 @@ "seasons": 6, "episodes": 49, "platform": "HBO", - "genres": [ - "Comedy", - "Documentary" - ], + "genres": ["Comedy", "Documentary"], "cast": [ "Jason Castillo", "Jessica Burke", @@ -62,15 +51,8 @@ "seasons": 5, "episodes": 23, "platform": "Amazon Prime", - "genres": [ - "Comedy", - "Thriller" - ], - "cast": [ - "Mark Allen", - "Anthony Snyder", - "Kimberly Jones" - ], + "genres": ["Comedy", "Thriller"], + "cast": ["Mark Allen", "Anthony Snyder", "Kimberly Jones"], "start_year": 2002, 
"end_year": null }, @@ -80,16 +62,8 @@ "seasons": 1, "episodes": 12, "platform": "Amazon Prime", - "genres": [ - "Crime", - "Documentary" - ], - "cast": [ - "Matthew Green", - "Kelly Wright", - "Tonya Sullivan", - "Daniel Brown" - ], + "genres": ["Crime", "Documentary"], + "cast": ["Matthew Green", "Kelly Wright", "Tonya Sullivan", "Daniel Brown"], "start_year": 2009, "end_year": 2020 }, @@ -99,14 +73,8 @@ "seasons": 10, "episodes": 76, "platform": "Amazon Prime", - "genres": [ - "Drama" - ], - "cast": [ - "Stacey Shaw", - "Zachary Steele", - "Laurie Martinez" - ], + "genres": ["Drama"], + "cast": ["Stacey Shaw", "Zachary Steele", "Laurie Martinez"], "start_year": 2011, "end_year": 2020 }, @@ -116,15 +84,8 @@ "seasons": 5, "episodes": 73, "platform": "HBO", - "genres": [ - "Thriller" - ], - "cast": [ - "Diane Boyd", - "Anna Rubio", - "Cheryl Fisher", - "Tyler Villa" - ], + "genres": ["Thriller"], + "cast": ["Diane Boyd", "Anna Rubio", "Cheryl Fisher", "Tyler Villa"], "start_year": 2008, "end_year": 2020 }, @@ -134,9 +95,7 @@ "seasons": 2, "episodes": 114, "platform": "Amazon Prime", - "genres": [ - "Fantasy" - ], + "genres": ["Fantasy"], "cast": [ "Kathleen Marshall", "Kimberly Quinn", @@ -154,9 +113,7 @@ "seasons": 3, "episodes": 55, "platform": "Disney+", - "genres": [ - "Drama" - ], + "genres": ["Drama"], "cast": [ "Barbara Clark", "Carolyn Scott", @@ -173,16 +130,8 @@ "seasons": 4, "episodes": 61, "platform": "Amazon Prime", - "genres": [ - "Comedy", - "Fantasy" - ], - "cast": [ - "Adam Lin", - "Evan Smith", - "Christine Howard", - "Ruben Hopkins" - ], + "genres": ["Comedy", "Fantasy"], + "cast": ["Adam Lin", "Evan Smith", "Christine Howard", "Ruben Hopkins"], "start_year": 2006, "end_year": 2023 }, @@ -192,9 +141,7 @@ "seasons": 1, "episodes": 90, "platform": "HBO", - "genres": [ - "Comedy" - ], + "genres": ["Comedy"], "cast": [ "Eric Ryan", "Ashley Ball", @@ -211,10 +158,7 @@ "seasons": 10, "episodes": 69, "platform": "Hulu", - "genres": [ - "Documentary", - 
"Fantasy" - ], + "genres": ["Documentary", "Fantasy"], "cast": [ "Mrs. Olivia Booth", "William Murphy", @@ -232,14 +176,8 @@ "seasons": 3, "episodes": 89, "platform": "Disney+", - "genres": [ - "Crime" - ], - "cast": [ - "Elizabeth Lambert", - "Corey Hughes", - "Melissa Stephens" - ], + "genres": ["Crime"], + "cast": ["Elizabeth Lambert", "Corey Hughes", "Melissa Stephens"], "start_year": 2006, "end_year": null }, @@ -249,15 +187,8 @@ "seasons": 9, "episodes": 73, "platform": "Disney+", - "genres": [ - "Documentary", - "Drama" - ], - "cast": [ - "Shane Richardson", - "Lisa Cooper", - "Samantha Perkins" - ], + "genres": ["Documentary", "Drama"], + "cast": ["Shane Richardson", "Lisa Cooper", "Samantha Perkins"], "start_year": 2008, "end_year": null }, @@ -267,14 +198,8 @@ "seasons": 8, "episodes": 40, "platform": "Netflix", - "genres": [ - "Crime" - ], - "cast": [ - "Patricia Barrett", - "Scott Gonzalez", - "Michaela Johnson" - ], + "genres": ["Crime"], + "cast": ["Patricia Barrett", "Scott Gonzalez", "Michaela Johnson"], "start_year": 2006, "end_year": null }, @@ -284,14 +209,8 @@ "seasons": 8, "episodes": 61, "platform": "Hulu", - "genres": [ - "Drama" - ], - "cast": [ - "Christie Waters", - "Casey Allen", - "Nicole Frank" - ], + "genres": ["Drama"], + "cast": ["Christie Waters", "Casey Allen", "Nicole Frank"], "start_year": 2001, "end_year": 2005 }, @@ -301,9 +220,7 @@ "seasons": 10, "episodes": 89, "platform": "Hulu", - "genres": [ - "Drama" - ], + "genres": ["Drama"], "cast": [ "Pedro Butler", "Christian Hall", @@ -321,9 +238,7 @@ "seasons": 5, "episodes": 11, "platform": "Hulu", - "genres": [ - "Drama" - ], + "genres": ["Drama"], "cast": [ "Deborah Garcia", "Michelle Barajas", @@ -339,10 +254,7 @@ "seasons": 1, "episodes": 29, "platform": "Amazon Prime", - "genres": [ - "Fantasy", - "Documentary" - ], + "genres": ["Fantasy", "Documentary"], "cast": [ "Grace Rodriguez", "Alison Greene", @@ -358,9 +270,7 @@ "seasons": 9, "episodes": 111, "platform": "Disney+", - 
"genres": [ - "Documentary" - ], + "genres": ["Documentary"], "cast": [ "Emily Irwin", "Olivia Gibson", @@ -376,10 +286,7 @@ "seasons": 8, "episodes": 108, "platform": "Hulu", - "genres": [ - "Drama", - "Crime" - ], + "genres": ["Drama", "Crime"], "cast": [ "Karen Phillips", "Kelly Marsh", @@ -395,10 +302,7 @@ "seasons": 6, "episodes": 66, "platform": "Amazon Prime", - "genres": [ - "Crime", - "Documentary" - ], + "genres": ["Crime", "Documentary"], "cast": [ "Bradley Chavez", "Catherine Horn", @@ -414,15 +318,8 @@ "seasons": 9, "episodes": 22, "platform": "Hulu", - "genres": [ - "Drama" - ], - "cast": [ - "Eric Lee", - "Patrick Estrada", - "Kelsey Brown", - "Jeffrey Lewis" - ], + "genres": ["Drama"], + "cast": ["Eric Lee", "Patrick Estrada", "Kelsey Brown", "Jeffrey Lewis"], "start_year": 2001, "end_year": null }, @@ -432,9 +329,7 @@ "seasons": 5, "episodes": 35, "platform": "Hulu", - "genres": [ - "Crime" - ], + "genres": ["Crime"], "cast": [ "Chad Torres", "Mark Williams", @@ -451,10 +346,7 @@ "seasons": 2, "episodes": 94, "platform": "Netflix", - "genres": [ - "Thriller", - "Fantasy" - ], + "genres": ["Thriller", "Fantasy"], "cast": [ "Catherine Davila", "Jessica James", @@ -471,10 +363,7 @@ "seasons": 2, "episodes": 87, "platform": "Hulu", - "genres": [ - "Drama", - "Fantasy" - ], + "genres": ["Drama", "Fantasy"], "cast": [ "Tiffany Brown", "Christina Morales", @@ -491,14 +380,8 @@ "seasons": 5, "episodes": 56, "platform": "Netflix", - "genres": [ - "Comedy" - ], - "cast": [ - "James Durham", - "Jessica Myers", - "Rachel King" - ], + "genres": ["Comedy"], + "cast": ["James Durham", "Jessica Myers", "Rachel King"], "start_year": 2005, "end_year": null }, @@ -508,10 +391,7 @@ "seasons": 4, "episodes": 99, "platform": "Disney+", - "genres": [ - "Crime", - "Fantasy" - ], + "genres": ["Crime", "Fantasy"], "cast": [ "Robert Foster", "Jill Barton", @@ -527,10 +407,7 @@ "seasons": 9, "episodes": 24, "platform": "Amazon Prime", - "genres": [ - "Drama", - "Crime" - ], + 
"genres": ["Drama", "Crime"], "cast": [ "Carl Johnson", "Douglas Beck", @@ -548,15 +425,8 @@ "seasons": 10, "episodes": 117, "platform": "HBO", - "genres": [ - "Crime", - "Fantasy" - ], - "cast": [ - "Carol Miller", - "Jennifer Bass", - "Melanie Leblanc" - ], + "genres": ["Crime", "Fantasy"], + "cast": ["Carol Miller", "Jennifer Bass", "Melanie Leblanc"], "start_year": 2002, "end_year": null }, @@ -566,10 +436,7 @@ "seasons": 1, "episodes": 58, "platform": "Hulu", - "genres": [ - "Crime", - "Drama" - ], + "genres": ["Crime", "Drama"], "cast": [ "James Warren", "Kelly Carter", @@ -586,9 +453,7 @@ "seasons": 6, "episodes": 71, "platform": "Netflix", - "genres": [ - "Documentary" - ], + "genres": ["Documentary"], "cast": [ "Sarah Brown", "Patrick Beck", @@ -604,14 +469,8 @@ "seasons": 4, "episodes": 16, "platform": "Hulu", - "genres": [ - "Fantasy" - ], - "cast": [ - "Gabrielle Meyer", - "Madison Matthews", - "Taylor Martinez" - ], + "genres": ["Fantasy"], + "cast": ["Gabrielle Meyer", "Madison Matthews", "Taylor Martinez"], "start_year": 2010, "end_year": null }, @@ -621,14 +480,8 @@ "seasons": 1, "episodes": 79, "platform": "Hulu", - "genres": [ - "Fantasy" - ], - "cast": [ - "Michael Lewis", - "Cassandra Hicks", - "Sydney Garcia" - ], + "genres": ["Fantasy"], + "cast": ["Michael Lewis", "Cassandra Hicks", "Sydney Garcia"], "start_year": 2015, "end_year": 2023 }, @@ -638,16 +491,8 @@ "seasons": 7, "episodes": 82, "platform": "Hulu", - "genres": [ - "Crime", - "Fantasy" - ], - "cast": [ - "Keith Brown", - "Annette Johnson", - "Joseph Carroll", - "Derek Lewis" - ], + "genres": ["Crime", "Fantasy"], + "cast": ["Keith Brown", "Annette Johnson", "Joseph Carroll", "Derek Lewis"], "start_year": 2006, "end_year": 2008 }, @@ -657,10 +502,7 @@ "seasons": 2, "episodes": 52, "platform": "Amazon Prime", - "genres": [ - "Fantasy", - "Drama" - ], + "genres": ["Fantasy", "Drama"], "cast": [ "Garrett Mcgrath", "Craig Jackson", @@ -676,16 +518,8 @@ "seasons": 1, "episodes": 113, 
"platform": "Netflix", - "genres": [ - "Thriller", - "Comedy" - ], - "cast": [ - "Matthew Hill", - "Andrew White", - "Grant Young", - "John Mathews" - ], + "genres": ["Thriller", "Comedy"], + "cast": ["Matthew Hill", "Andrew White", "Grant Young", "John Mathews"], "start_year": 2015, "end_year": 2020 }, @@ -695,9 +529,7 @@ "seasons": 3, "episodes": 40, "platform": "Netflix", - "genres": [ - "Comedy" - ], + "genres": ["Comedy"], "cast": [ "Matthew Gordon", "Mark Allen", @@ -715,10 +547,7 @@ "seasons": 10, "episodes": 106, "platform": "HBO", - "genres": [ - "Fantasy", - "Drama" - ], + "genres": ["Fantasy", "Drama"], "cast": [ "Elizabeth Taylor", "Melissa Mullins", @@ -735,16 +564,9 @@ "seasons": 3, "episodes": 88, "platform": "HBO", - "genres": [ - "Thriller", - "Drama" - ], - "cast": [ - "Amy Aguilar", - "James Williams", - "Kevin Kirby" - ], + "genres": ["Thriller", "Drama"], + "cast": ["Amy Aguilar", "James Williams", "Kevin Kirby"], "start_year": 2010, "end_year": 2025 } -] \ No newline at end of file +] From 8c8a25b2223868cb5289ebbe0a83167555b29385 Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Tue, 8 Jul 2025 13:05:24 +0200 Subject: [PATCH 29/91] chore: lint fixes --- tests/accuracy/delete-many.test.ts | 1 - tests/accuracy/sdk/accuracy-scorers.ts | 1 - 2 files changed, 2 deletions(-) diff --git a/tests/accuracy/delete-many.test.ts b/tests/accuracy/delete-many.test.ts index 4d50169d..f9c03740 100644 --- a/tests/accuracy/delete-many.test.ts +++ b/tests/accuracy/delete-many.test.ts @@ -1,7 +1,6 @@ import { describeAccuracyTests, describeSuite } from "./sdk/describe-accuracy-tests.js"; import { getAvailableModels } from "./sdk/models.js"; import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; -import { deleteManyResponse } from "../../src/tools/mongodb/delete/deleteMany.js"; function callsDeleteManyWithEmptyFilters(prompt: string): AccuracyTestConfig { return { diff --git a/tests/accuracy/sdk/accuracy-scorers.ts 
b/tests/accuracy/sdk/accuracy-scorers.ts index 7bd8b969..fd692ac9 100644 --- a/tests/accuracy/sdk/accuracy-scorers.ts +++ b/tests/accuracy/sdk/accuracy-scorers.ts @@ -129,6 +129,5 @@ function compareParams(expected: unknown, actual: unknown): number { return minScore; } - // eslint-disable-next-line eqeqeq return expected == actual ? 1 : 0; } From ebe14d5534b68fd04218ecace7a488064f112c02 Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Tue, 8 Jul 2025 16:06:21 +0200 Subject: [PATCH 30/91] chore: simplified toolCallingAccuracy calculation --- package-lock.json | 7 + tests/accuracy/sdk/accuracy-scorers.ts | 155 +++++------------- .../mdb-snapshot-storage.ts | 4 +- .../snapshot-storage.ts | 20 ++- tests/accuracy/sdk/accuracy-testing-client.ts | 6 +- tests/accuracy/sdk/describe-accuracy-tests.ts | 18 +- tests/accuracy/sdk/models.ts | 33 +++- 7 files changed, 107 insertions(+), 136 deletions(-) diff --git a/package-lock.json b/package-lock.json index 235bc95d..b5405ad5 100644 --- a/package-lock.json +++ b/package-lock.json @@ -9606,6 +9606,13 @@ "node": ">= 0.6" } }, + "node_modules/microdiff": { + "version": "1.5.0", + "resolved": "https://registry.npmjs.org/microdiff/-/microdiff-1.5.0.tgz", + "integrity": "sha512-Drq+/THMvDdzRYrK0oxJmOKiC24ayUV8ahrt8l3oRK51PWt6gdtrIGrlIH3pT/lFh1z93FbAcidtsHcWbnRz8Q==", + "dev": true, + "license": "MIT" + }, "node_modules/micromatch": { "version": "4.0.8", "resolved": "https://registry.npmjs.org/micromatch/-/micromatch-4.0.8.tgz", diff --git a/tests/accuracy/sdk/accuracy-scorers.ts b/tests/accuracy/sdk/accuracy-scorers.ts index fd692ac9..612c3f80 100644 --- a/tests/accuracy/sdk/accuracy-scorers.ts +++ b/tests/accuracy/sdk/accuracy-scorers.ts @@ -1,133 +1,60 @@ -export type ToolCall = { - toolCallId: string; - toolName: string; - parameters: unknown; -}; -export type ExpectedToolCall = Omit; +import diff from "microdiff"; +import { ExpectedToolCall, ActualToolCall } from "./accuracy-snapshot-storage/snapshot-storage.js"; -export 
function toolCallingAccuracyScorer(expectedToolCalls: ExpectedToolCall[], actualToolCalls: ToolCall[]): number { - if (actualToolCalls.length < expectedToolCalls.length) { - return 0; - } - - const possibleScore = actualToolCalls.length > expectedToolCalls.length ? 0.75 : 1; - const checkedToolCallIds = new Set(); - for (const expectedToolCall of expectedToolCalls) { - const matchingActualToolCall = actualToolCalls.find( - (actualToolCall) => - actualToolCall.toolName === expectedToolCall.toolName && - !checkedToolCallIds.has(actualToolCall.toolCallId) - ); - - if (!matchingActualToolCall) { - return 0; - } - - checkedToolCallIds.add(matchingActualToolCall.toolCallId); - } - - return possibleScore; -} - -export function parameterMatchingAccuracyScorer( +export function calculateToolCallingAccuracy( expectedToolCalls: ExpectedToolCall[], - actualToolCalls: ToolCall[] + actualToolCalls: ActualToolCall[] ): number { if (expectedToolCalls.length === 0) { - return 1; + return actualToolCalls.length === 0 ? 1 : 0.75; } - const usedActualIndexes = new Set(); - const scores: number[] = []; + const maxAccuracy = actualToolCalls.length > expectedToolCalls.length ? 
0.75 : 1; + + const individualAccuracies: number[] = []; + const checkedActualToolCallIndexes = new Set(); for (const expectedCall of expectedToolCalls) { - // Find all unmatched actual tool calls with the same tool name const candidates = actualToolCalls .map((call, index) => ({ call, index })) - .filter(({ call, index }) => !usedActualIndexes.has(index) && call.toolName === expectedCall.toolName); - - if (candidates.length === 0) { - scores.push(0); - continue; - } - - // Pick the candidate with the best parameter match - let bestScore = -1; - let bestIndex = -1; - for (const { call, index } of candidates) { - const score = compareParams(expectedCall.parameters, call.parameters); - if (score > bestScore) { - bestScore = score; - bestIndex = index; - } - } - - usedActualIndexes.add(bestIndex); - scores.push(bestScore); - } - - const totalScore = scores.reduce((sum, score) => sum + score, 0); - return totalScore / scores.length; + .filter( + ({ call, index }) => !checkedActualToolCallIndexes.has(index) && call.toolName === expectedCall.toolName + ) + .map(({ call, index }) => ({ + call, + index, + score: compareParams(expectedCall.parameters, call.parameters), + })) + .filter(({ score }) => score >= 0.75) + .sort((a, b) => b.score - a.score); + + const bestMatch = candidates[0]; + if (!bestMatch) { + individualAccuracies.push(0); + } else { + checkedActualToolCallIndexes.add(bestMatch.index); + const individualAccuracy = Math.min(bestMatch.score, maxAccuracy); + individualAccuracies.push(individualAccuracy); + } + } + + return Math.min(...individualAccuracies); } -/** - * Recursively compares expected and actual parameters and returns a score. - * - 1: Perfect match. - * - 0.75: All expected parameters are present and match, but there are extra actual parameters. - * - 0: Missing parameters or mismatched values. 
- */ -function compareParams(expected: unknown, actual: unknown): number { - if (expected === null || expected === undefined) { - return actual === null || actual === undefined ? 1 : 0; - } - if (actual === null || actual === undefined) { - return 0; - } +function compareParams(expected: Record, actual: Record): number { + const differences = diff(expected, actual); - if (Array.isArray(expected)) { - if (!Array.isArray(actual) || actual.length < expected.length) { - return 0; - } - let minScore = 1; - for (let i = 0; i < expected.length; i++) { - minScore = Math.min(minScore, compareParams(expected[i], actual[i])); - } - if (minScore === 0) { - return 0; - } - if (actual.length > expected.length) { - minScore = Math.min(minScore, 0.75); - } - return minScore; + if (differences.length === 0) { + return 1; } - if (typeof expected === "object") { - if (typeof actual !== "object" || Array.isArray(actual)) { - return 0; - } - const expectedKeys = Object.keys(expected as Record); - const actualKeys = Object.keys(actual as Record); - - let minScore = 1; - for (const key of expectedKeys) { - if (!Object.prototype.hasOwnProperty.call(actual, key)) { - return 0; - } - minScore = Math.min( - minScore, - compareParams((expected as Record)[key], (actual as Record)[key]) - ); - } + const hasOnlyAdditions = differences.every((d) => d.type === "CREATE"); + const hasRemovals = differences.some((d) => d.type === "REMOVE"); + const hasChanges = differences.some((d) => d.type === "CHANGE"); - if (minScore === 0) { - return 0; - } - - if (actualKeys.length > expectedKeys.length) { - minScore = Math.min(minScore, 0.75); - } - return minScore; + if (hasOnlyAdditions && !hasRemovals && !hasChanges) { + return 0.75; } - return expected == actual ? 
1 : 0; + return 0; } diff --git a/tests/accuracy/sdk/accuracy-snapshot-storage/mdb-snapshot-storage.ts b/tests/accuracy/sdk/accuracy-snapshot-storage/mdb-snapshot-storage.ts index c93abe12..48aac4e8 100644 --- a/tests/accuracy/sdk/accuracy-snapshot-storage/mdb-snapshot-storage.ts +++ b/tests/accuracy/sdk/accuracy-snapshot-storage/mdb-snapshot-storage.ts @@ -28,11 +28,13 @@ export class MongoDBSnapshotStorage implements AccuracySnapshotStorage { async createSnapshotEntry( snapshotEntry: Pick< AccuracySnapshotEntry, + | "provider" | "requestedModel" | "test" | "prompt" | "toolCallingAccuracy" - | "parameterAccuracy" + | "expectedToolCalls" + | "actualToolCalls" | "llmResponseTime" | "tokensUsage" | "respondingModel" diff --git a/tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.ts b/tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.ts index eb0e453f..b254787c 100644 --- a/tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.ts +++ b/tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.ts @@ -1,16 +1,30 @@ import z from "zod"; +const ExpectedToolCallSchema = z.object({ + toolCallId: z.string(), + toolName: z.string(), + parameters: z.record(z.string(), z.unknown()), +}); + +const ActualToolCallSchema = ExpectedToolCallSchema.omit({ toolCallId: undefined }); + +export type ExpectedToolCall = z.infer; +export type ActualToolCall = z.infer; + export const AccuracySnapshotEntrySchema = z.object({ // Git and meta information for snapshot entries accuracyRunId: z.string(), createdOn: z.number(), commitSHA: z.string(), // Accuracy info + provider: z.string(), requestedModel: z.string(), test: z.string(), prompt: z.string(), toolCallingAccuracy: z.number(), - parameterAccuracy: z.number(), + // debug info for further investigations + expectedToolCalls: ExpectedToolCallSchema.array(), + actualToolCalls: ActualToolCallSchema.array(), llmResponseTime: z.number(), tokensUsage: z .object({ @@ -30,11 +44,13 @@ export interface 
AccuracySnapshotStorage { createSnapshotEntry( snapshotEntry: Pick< AccuracySnapshotEntry, + | "provider" | "requestedModel" | "test" | "prompt" | "toolCallingAccuracy" - | "parameterAccuracy" + | "expectedToolCalls" + | "actualToolCalls" | "llmResponseTime" | "tokensUsage" | "respondingModel" diff --git a/tests/accuracy/sdk/accuracy-testing-client.ts b/tests/accuracy/sdk/accuracy-testing-client.ts index b12017d7..8c5f27ad 100644 --- a/tests/accuracy/sdk/accuracy-testing-client.ts +++ b/tests/accuracy/sdk/accuracy-testing-client.ts @@ -5,7 +5,7 @@ import { experimental_createMCPClient as createMCPClient, tool as createVercelTo import { CallToolResult } from "@modelcontextprotocol/sdk/types.js"; import { StdioClientTransport } from "@modelcontextprotocol/sdk/client/stdio.js"; -import { ToolCall } from "./accuracy-scorers.js"; +import { ExpectedToolCall } from "./accuracy-snapshot-storage/snapshot-storage.js"; const __dirname = fileURLToPath(import.meta.url); const distPath = path.join(__dirname, "..", "..", "..", "..", "dist"); @@ -16,7 +16,7 @@ export type MockedTools = Record; export class AccuracyTestingClient { private mockedTools: MockedTools = {}; - private recordedToolCalls: ToolCall[] = []; + private recordedToolCalls: ExpectedToolCall[] = []; private constructor(private readonly vercelMCPClient: Awaited>) {} async close() { @@ -33,7 +33,7 @@ export class AccuracyTestingClient { this.recordedToolCalls.push({ toolCallId: uuid(), toolName: toolName, - parameters: args, + parameters: args as Record, }); try { const toolResultGeneratorFn = this.mockedTools[toolName]; diff --git a/tests/accuracy/sdk/describe-accuracy-tests.ts b/tests/accuracy/sdk/describe-accuracy-tests.ts index 5670207a..f472c7f2 100644 --- a/tests/accuracy/sdk/describe-accuracy-tests.ts +++ b/tests/accuracy/sdk/describe-accuracy-tests.ts @@ -1,10 +1,10 @@ import { TestableModels } from "./models.js"; -import { ExpectedToolCall, parameterMatchingAccuracyScorer, toolCallingAccuracyScorer } from 
"./accuracy-scorers.js"; +import { calculateToolCallingAccuracy } from "./accuracy-scorers.js"; import { getVercelToolCallingAgent, VercelAgent } from "./agent.js"; import { prepareTestData, setupMongoDBIntegrationTest } from "../../integration/tools/mongodb/mongodbHelpers.js"; import { AccuracyTestingClient, MockedTools } from "./accuracy-testing-client.js"; import { getAccuracySnapshotStorage } from "./accuracy-snapshot-storage/get-snapshot-storage.js"; -import { AccuracySnapshotStorage } from "./accuracy-snapshot-storage/snapshot-storage.js"; +import { AccuracySnapshotStorage, ExpectedToolCall } from "./accuracy-snapshot-storage/snapshot-storage.js"; export interface AccuracyTestConfig { systemPrompt?: string; @@ -33,7 +33,7 @@ export function describeAccuracyTests( const eachModel = describe.each(models); const eachSuite = describe.each(Object.keys(accuracyTestConfigs)); - eachModel(`$modelName`, function (model) { + eachModel(`$displayName`, function (model) { const mdbIntegration = setupMongoDBIntegrationTest(); const { populateTestData, cleanupTestDatabases } = prepareTestData(mdbIntegration); @@ -72,20 +72,18 @@ export function describeAccuracyTests( const result = await agent.prompt(promptForModel, model, toolsForModel); const timeAfterPrompt = Date.now(); const toolCalls = testMCPClient.getToolCalls(); - const toolCallingAccuracy = toolCallingAccuracyScorer(testConfig.expectedToolCalls, toolCalls); - const parameterMatchingAccuracy = parameterMatchingAccuracyScorer( - testConfig.expectedToolCalls, - toolCalls - ); + const toolCallingAccuracy = calculateToolCallingAccuracy(testConfig.expectedToolCalls, toolCalls); const responseTime = timeAfterPrompt - timeBeforePrompt; await accuracySnapshotStorage.createSnapshotEntry({ + provider: model.provider, requestedModel: model.modelName, test: suiteName, prompt: testConfig.prompt, llmResponseTime: responseTime, - toolCallingAccuracy, - parameterAccuracy: parameterMatchingAccuracy, + toolCallingAccuracy: 
toolCallingAccuracy, + actualToolCalls: toolCalls, + expectedToolCalls: testConfig.expectedToolCalls, ...result, }); }); diff --git a/tests/accuracy/sdk/models.ts b/tests/accuracy/sdk/models.ts index eb7f4b91..70b80435 100644 --- a/tests/accuracy/sdk/models.ts +++ b/tests/accuracy/sdk/models.ts @@ -6,13 +6,21 @@ import { ollama } from "ollama-ai-provider"; export interface Model

{ readonly modelName: string; + readonly provider: string; + readonly displayName: string; isAvailable(): boolean; getModel(): P; } export class OpenAIModel implements Model { + readonly provider = "OpenAI"; + constructor(readonly modelName: string) {} + get displayName(): string { + return `${this.provider} - ${this.modelName}`; + } + isAvailable(): boolean { return !!process.env.MDB_OPEN_AI_API_KEY; } @@ -25,8 +33,14 @@ export class OpenAIModel implements Model { } export class AzureOpenAIModel implements Model { + readonly provider = "Azure"; + constructor(readonly modelName: string) {} + get displayName(): string { + return `${this.provider} - ${this.modelName}`; + } + isAvailable(): boolean { return !!process.env.MDB_AZURE_OPEN_AI_API_KEY && !!process.env.MDB_AZURE_OPEN_AI_API_URL; } @@ -41,8 +55,14 @@ export class AzureOpenAIModel implements Model { } export class GeminiModel implements Model { + readonly provider = "Google"; + constructor(readonly modelName: string) {} + get displayName(): string { + return `${this.provider} - ${this.modelName}`; + } + isAvailable(): boolean { return !!process.env.MDB_GEMINI_API_KEY; } @@ -55,8 +75,14 @@ export class GeminiModel implements Model { } export class OllamaModel implements Model { + readonly provider = "Ollama"; + constructor(readonly modelName: string) {} + get displayName(): string { + return `${this.provider} - ${this.modelName}`; + } + isAvailable(): boolean { return true; } @@ -66,12 +92,7 @@ export class OllamaModel implements Model { } } -const ALL_TESTABLE_MODELS = [ - // new GeminiModel("gemini-2.0-flash"), - // new OpenAIModel("gpt-4o"), - new AzureOpenAIModel("gpt-4o"), - // new OllamaModel("qwen3:1.7b"), -]; +const ALL_TESTABLE_MODELS = [new AzureOpenAIModel("gpt-4o")]; export type TestableModels = ReturnType; From ad316f7e3aba8f6c17090fb11626d45005df0433 Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Tue, 8 Jul 2025 16:13:58 +0200 Subject: [PATCH 31/91] chore: account for types moved around --- 
tests/accuracy/collection-storage-size.test.ts | 2 +- tests/accuracy/create-collection.test.ts | 2 +- tests/accuracy/drop-collection.test.ts | 2 +- tests/accuracy/drop-database.test.ts | 2 +- tests/accuracy/logs.test.ts | 2 +- .../sdk/accuracy-snapshot-storage/snapshot-storage.ts | 3 +-- tests/accuracy/sdk/accuracy-testing-client.ts | 4 ++-- 7 files changed, 8 insertions(+), 9 deletions(-) diff --git a/tests/accuracy/collection-storage-size.test.ts b/tests/accuracy/collection-storage-size.test.ts index 751b84d6..dbb458e1 100644 --- a/tests/accuracy/collection-storage-size.test.ts +++ b/tests/accuracy/collection-storage-size.test.ts @@ -1,7 +1,7 @@ import { describeAccuracyTests, describeSuite } from "./sdk/describe-accuracy-tests.js"; import { getAvailableModels } from "./sdk/models.js"; import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; -import { ExpectedToolCall } from "./sdk/accuracy-scorers.js"; +import { ExpectedToolCall } from "./sdk/accuracy-snapshot-storage/snapshot-storage.js"; function callsCollectionStorageSize(prompt: string, expectedToolCalls: ExpectedToolCall[]): AccuracyTestConfig { return { diff --git a/tests/accuracy/create-collection.test.ts b/tests/accuracy/create-collection.test.ts index ab468a62..d8a6266f 100644 --- a/tests/accuracy/create-collection.test.ts +++ b/tests/accuracy/create-collection.test.ts @@ -1,7 +1,7 @@ import { describeAccuracyTests, describeSuite } from "./sdk/describe-accuracy-tests.js"; import { getAvailableModels } from "./sdk/models.js"; import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; -import { ExpectedToolCall } from "./sdk/accuracy-scorers.js"; +import { ExpectedToolCall } from "./sdk/accuracy-snapshot-storage/snapshot-storage.js"; function callsCreateCollection(prompt: string, database: string, collection: string): AccuracyTestConfig { return { diff --git a/tests/accuracy/drop-collection.test.ts b/tests/accuracy/drop-collection.test.ts index e51494b7..89f9cb70 100644 --- 
a/tests/accuracy/drop-collection.test.ts +++ b/tests/accuracy/drop-collection.test.ts @@ -1,7 +1,7 @@ import { describeAccuracyTests, describeSuite } from "./sdk/describe-accuracy-tests.js"; import { getAvailableModels } from "./sdk/models.js"; import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; -import { ExpectedToolCall } from "./sdk/accuracy-scorers.js"; +import { ExpectedToolCall } from "./sdk/accuracy-snapshot-storage/snapshot-storage.js"; function onlyCallsDropCollection(prompt: string): AccuracyTestConfig { return { diff --git a/tests/accuracy/drop-database.test.ts b/tests/accuracy/drop-database.test.ts index 08ffe640..0518d982 100644 --- a/tests/accuracy/drop-database.test.ts +++ b/tests/accuracy/drop-database.test.ts @@ -1,7 +1,7 @@ import { describeAccuracyTests, describeSuite } from "./sdk/describe-accuracy-tests.js"; import { getAvailableModels } from "./sdk/models.js"; import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; -import { ExpectedToolCall } from "./sdk/accuracy-scorers.js"; +import { ExpectedToolCall } from "./sdk/accuracy-snapshot-storage/snapshot-storage.js"; function onlyCallsDropDatabase(prompt: string): AccuracyTestConfig { return { diff --git a/tests/accuracy/logs.test.ts b/tests/accuracy/logs.test.ts index afd2a697..4ca148b9 100644 --- a/tests/accuracy/logs.test.ts +++ b/tests/accuracy/logs.test.ts @@ -1,7 +1,7 @@ import { describeAccuracyTests, describeSuite } from "./sdk/describe-accuracy-tests.js"; import { getAvailableModels } from "./sdk/models.js"; import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; -import { ExpectedToolCall } from "./sdk/accuracy-scorers.js"; +import { ExpectedToolCall } from "./sdk/accuracy-snapshot-storage/snapshot-storage.js"; function callsLogsTool(prompt: string, toolCall: ExpectedToolCall): AccuracyTestConfig { return { diff --git a/tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.ts 
b/tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.ts index b254787c..2f9c432a 100644 --- a/tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.ts +++ b/tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.ts @@ -1,12 +1,11 @@ import z from "zod"; const ExpectedToolCallSchema = z.object({ - toolCallId: z.string(), toolName: z.string(), parameters: z.record(z.string(), z.unknown()), }); -const ActualToolCallSchema = ExpectedToolCallSchema.omit({ toolCallId: undefined }); +const ActualToolCallSchema = ExpectedToolCallSchema.extend({ toolCallId: z.string() }); export type ExpectedToolCall = z.infer; export type ActualToolCall = z.infer; diff --git a/tests/accuracy/sdk/accuracy-testing-client.ts b/tests/accuracy/sdk/accuracy-testing-client.ts index 8c5f27ad..4a8ad279 100644 --- a/tests/accuracy/sdk/accuracy-testing-client.ts +++ b/tests/accuracy/sdk/accuracy-testing-client.ts @@ -5,7 +5,7 @@ import { experimental_createMCPClient as createMCPClient, tool as createVercelTo import { CallToolResult } from "@modelcontextprotocol/sdk/types.js"; import { StdioClientTransport } from "@modelcontextprotocol/sdk/client/stdio.js"; -import { ExpectedToolCall } from "./accuracy-snapshot-storage/snapshot-storage.js"; +import { ActualToolCall } from "./accuracy-snapshot-storage/snapshot-storage.js"; const __dirname = fileURLToPath(import.meta.url); const distPath = path.join(__dirname, "..", "..", "..", "..", "dist"); @@ -16,7 +16,7 @@ export type MockedTools = Record; export class AccuracyTestingClient { private mockedTools: MockedTools = {}; - private recordedToolCalls: ExpectedToolCall[] = []; + private recordedToolCalls: ActualToolCall[] = []; private constructor(private readonly vercelMCPClient: Awaited>) {} async close() { From b34f6bc9ca5d237307b2300d5b2a27e5665df98d Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Tue, 8 Jul 2025 21:14:01 +0200 Subject: [PATCH 32/91] chore: adds accuracyRunStatus to snapshot entries The new field 
`accuracyRunStatus` is supposed to help guard against cases where jest might fail in between, maybe due to LLM rate limit errors or something else, and we then have a partially saved state of an accuracy run. With the new field `accuracyRunStatus` we should be able to safely look for last runs where `accuracyRunStatus` is done and have complete state of accuracy snapshot. --- scripts/mark-accuracy-run-finished.ts | 7 ++++++ scripts/run-accuracy-tests.sh | 23 ++++++++++++++++++- .../mdb-snapshot-storage.ts | 15 +++++++++++- .../snapshot-storage.ts | 13 +++++++++-- 4 files changed, 54 insertions(+), 4 deletions(-) create mode 100644 scripts/mark-accuracy-run-finished.ts diff --git a/scripts/mark-accuracy-run-finished.ts b/scripts/mark-accuracy-run-finished.ts new file mode 100644 index 00000000..ad3e3530 --- /dev/null +++ b/scripts/mark-accuracy-run-finished.ts @@ -0,0 +1,7 @@ +import { getAccuracySnapshotStorage } from "../tests/accuracy/sdk/accuracy-snapshot-storage/get-snapshot-storage.js"; + +console.time(`Marked accuracy run id - ${process.env.MDB_ACCURACY_RUN_ID} as finished in`); +const storage = await getAccuracySnapshotStorage(); +await storage.accuracyRunFinished(); +await storage.close(); +console.timeEnd(`Marked accuracy run id - ${process.env.MDB_ACCURACY_RUN_ID} as finished in`); diff --git a/scripts/run-accuracy-tests.sh b/scripts/run-accuracy-tests.sh index 979f49e1..20a16591 100644 --- a/scripts/run-accuracy-tests.sh +++ b/scripts/run-accuracy-tests.sh @@ -2,6 +2,27 @@ # Variables necessary for the accuracy test runs export MDB_ACCURACY_RUN_ID=$(npx uuid v4) +# For providing access tokens for different LLM providers +# export MDB_OPEN_AI_API_KEY="" +# export MDB_GEMINI_API_KEY="" +# export MDB_AZURE_OPEN_AI_API_KEY="" +# export MDB_AZURE_OPEN_AI_API_URL="" + +# For providing a mongodb based storage to store accuracy snapshots +# export MDB_ACCURACY_MDB_URL="" +# export MDB_ACCURACY_MDB_DB="" +# export MDB_ACCURACY_MDB_COLLECTION="" + +# By default we 
run all the tests under tests/accuracy folder unless a path is +# specified in the command line. Such as: +# npm run test:accuracy -- tests/accuracy/some-test.test.ts TEST_PATH_PATTERN="${1:-tests/accuracy}" shift || true -node --experimental-vm-modules node_modules/jest/bin/jest.js --testPathPattern "$TEST_PATH_PATTERN" "$@" \ No newline at end of file +node --experimental-vm-modules node_modules/jest/bin/jest.js --testPathPattern "$TEST_PATH_PATTERN" "$@" + +# Each test run submits an accuracy snapshot entry for each prompt with the +# accuracyRunStatus: "in-progress". When all the tests are done and jest exits +# with an exit code of 0, we can safely mark accuracy run as finished. +if [ $? -eq 0 ]; then + npx tsx scripts/mark-accuracy-run-finished.ts +fi \ No newline at end of file diff --git a/tests/accuracy/sdk/accuracy-snapshot-storage/mdb-snapshot-storage.ts b/tests/accuracy/sdk/accuracy-snapshot-storage/mdb-snapshot-storage.ts index 48aac4e8..c1e9ec5a 100644 --- a/tests/accuracy/sdk/accuracy-snapshot-storage/mdb-snapshot-storage.ts +++ b/tests/accuracy/sdk/accuracy-snapshot-storage/mdb-snapshot-storage.ts @@ -1,5 +1,10 @@ import { Collection, MongoClient } from "mongodb"; -import { AccuracySnapshotEntry, AccuracySnapshotEntrySchema, AccuracySnapshotStorage } from "./snapshot-storage.js"; +import { + AccuracyRunStatus, + AccuracySnapshotEntry, + AccuracySnapshotEntrySchema, + AccuracySnapshotStorage, +} from "./snapshot-storage.js"; export class MongoDBSnapshotStorage implements AccuracySnapshotStorage { private readonly client: MongoClient; @@ -46,6 +51,7 @@ export class MongoDBSnapshotStorage implements AccuracySnapshotStorage { ...snapshotEntry, commitSHA: this.commitSHA, accuracyRunId: this.accuracyRunId, + accuracyRunStatus: AccuracyRunStatus.InProgress, createdOn: Date.now(), }; await this.snapshotCollection.insertOne(snapshotWithMeta); @@ -70,6 +76,13 @@ export class MongoDBSnapshotStorage implements AccuracySnapshotStorage { return 
AccuracySnapshotEntrySchema.array().parse(snapshotEntries); } + async accuracyRunFinished(): Promise { + await this.snapshotCollection.updateMany( + { accuracyRunId: this.accuracyRunId }, + { $set: { accuracyRunStatus: AccuracyRunStatus.Done } } + ); + } + static getStorage(commitSHA: string, accuracyRunId: string): MongoDBSnapshotStorage { const mongodbUrl = process.env.MDB_ACCURACY_MDB_URL; const database = process.env.MDB_ACCURACY_MDB_DB; diff --git a/tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.ts b/tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.ts index 2f9c432a..f77c4d79 100644 --- a/tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.ts +++ b/tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.ts @@ -4,15 +4,22 @@ const ExpectedToolCallSchema = z.object({ toolName: z.string(), parameters: z.record(z.string(), z.unknown()), }); +export type ExpectedToolCall = z.infer; const ActualToolCallSchema = ExpectedToolCallSchema.extend({ toolCallId: z.string() }); - -export type ExpectedToolCall = z.infer; export type ActualToolCall = z.infer; +export const AccuracyRunStatus = { + Done: "done", + InProgress: "in-progress", +} as const; + export const AccuracySnapshotEntrySchema = z.object({ // Git and meta information for snapshot entries accuracyRunId: z.string(), + accuracyRunStatus: z + .enum([AccuracyRunStatus.Done, AccuracyRunStatus.InProgress]) + .default(AccuracyRunStatus.InProgress), createdOn: z.number(), commitSHA: z.string(), // Accuracy info @@ -60,5 +67,7 @@ export interface AccuracySnapshotStorage { getLatestSnapshotsForCommit(commit: string): Promise; + accuracyRunFinished(): Promise; + close(): Promise; } From 815952d4770868bfce6910af72e63a70b9007b22 Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Tue, 8 Jul 2025 23:35:23 +0200 Subject: [PATCH 33/91] chore: add disk based accuracy storage for local runs --- .gitignore | 1 + .../disk-snapshot-storage.ts | 122 ++++++++++++++++++ 
.../get-snapshot-storage.ts | 6 +- .../mdb-snapshot-storage.ts | 18 +-- 4 files changed, 137 insertions(+), 10 deletions(-) create mode 100644 tests/accuracy/sdk/accuracy-snapshot-storage/disk-snapshot-storage.ts diff --git a/.gitignore b/.gitignore index 4e3f7a54..2ac1f762 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,4 @@ state.json tests/tmp coverage +.accuracy-snapshots diff --git a/tests/accuracy/sdk/accuracy-snapshot-storage/disk-snapshot-storage.ts b/tests/accuracy/sdk/accuracy-snapshot-storage/disk-snapshot-storage.ts new file mode 100644 index 00000000..668e130a --- /dev/null +++ b/tests/accuracy/sdk/accuracy-snapshot-storage/disk-snapshot-storage.ts @@ -0,0 +1,122 @@ +import path from "path"; +import fs from "fs/promises"; +import { fileURLToPath } from "url"; +import { + AccuracyRunStatus, + AccuracySnapshotEntry, + AccuracySnapshotEntrySchema, + AccuracySnapshotStorage, +} from "./snapshot-storage.js"; +const __dirname = fileURLToPath(import.meta.url); +const rootDir = path.resolve(__dirname, "..", "..", "..", "..", ".."); +const snapshotsDir = path.resolve(rootDir, ".accuracy-snapshots"); +export const snapshotFilePath = path.resolve(snapshotsDir, "snapshots.json"); + +export class DiskSnapshotStorage implements AccuracySnapshotStorage { + private constructor( + private readonly accuracyRunId: string, + private readonly commitSHA: string + ) {} + + async createSnapshotEntry( + snapshotEntry: Pick< + AccuracySnapshotEntry, + | "provider" + | "requestedModel" + | "test" + | "prompt" + | "toolCallingAccuracy" + | "expectedToolCalls" + | "actualToolCalls" + | "llmResponseTime" + | "tokensUsage" + | "respondingModel" + | "text" + | "messages" + > + ): Promise { + const snapshotWithMeta: AccuracySnapshotEntry = { + ...snapshotEntry, + commitSHA: this.commitSHA, + accuracyRunId: this.accuracyRunId, + accuracyRunStatus: AccuracyRunStatus.InProgress, + createdOn: Date.now(), + }; + + await this.appendAccuracySnapshot(snapshotWithMeta); + } + + async 
getLatestSnapshotsForCommit(commit: string): Promise { + const snapshot = await this.readSnapshot(); + const entries = snapshot + .filter((entry) => { + return entry.commitSHA === commit && entry.accuracyRunStatus === AccuracyRunStatus.Done; + }) + .sort((a, b) => b.createdOn - a.createdOn); + const latestRunId = entries[0]?.accuracyRunId; + return latestRunId ? snapshot.filter((entry) => entry.accuracyRunId === latestRunId) : []; + } + + async accuracyRunFinished(): Promise { + const snapshot = await this.readSnapshot(); + const updatedSnapshot = snapshot.map((entry) => { + if (entry.accuracyRunId === this.accuracyRunId) { + return { + ...entry, + accuracyRunStatus: AccuracyRunStatus.Done, + }; + } + + return entry; + }); + await this.writeSnapshot(updatedSnapshot); + } + + close(): Promise { + return Promise.resolve(); + } + + private async appendAccuracySnapshot(entry: AccuracySnapshotEntry): Promise { + for (let attempt = 0; attempt < 5; attempt++) { + try { + const snapshot = await this.readSnapshot(); + snapshot.unshift(entry); + await this.writeSnapshot(snapshot); + return; + } catch (e) { + if (attempt < 4) { + await this.waitFor(100 + Math.random() * 200); + } else { + throw e; + } + } + } + } + + private async writeSnapshot(snapshot: AccuracySnapshotEntry[]): Promise { + const tmp = `${snapshotFilePath}~${Date.now()}`; + await fs.writeFile(tmp, JSON.stringify(snapshot, null, 2)); + await fs.rename(tmp, snapshotFilePath); + } + + private async readSnapshot(): Promise { + try { + const raw = await fs.readFile(snapshotFilePath, "utf8"); + return AccuracySnapshotEntrySchema.array().parse(JSON.parse(raw)); + } catch (e: unknown) { + if ((e as { code: string }).code === "ENOENT") { + return []; + } + throw e; + } + } + + private waitFor(ms: number) { + return new Promise((resolve) => setTimeout(resolve, ms)); + } + + static async getStorage(commitSHA: string, accuracyRunId: string) { + await fs.mkdir(snapshotsDir, { recursive: true }); + return new 
DiskSnapshotStorage(commitSHA, accuracyRunId); + } +} diff --git a/tests/accuracy/sdk/accuracy-snapshot-storage/get-snapshot-storage.ts b/tests/accuracy/sdk/accuracy-snapshot-storage/get-snapshot-storage.ts index 44c8ae3d..020afc79 100644 --- a/tests/accuracy/sdk/accuracy-snapshot-storage/get-snapshot-storage.ts +++ b/tests/accuracy/sdk/accuracy-snapshot-storage/get-snapshot-storage.ts @@ -1,4 +1,5 @@ import { getCommitSHA } from "../git-info.js"; +import { DiskSnapshotStorage } from "./disk-snapshot-storage.js"; import { MongoDBSnapshotStorage } from "./mdb-snapshot-storage.js"; import { AccuracySnapshotStorage } from "./snapshot-storage.js"; @@ -15,5 +16,8 @@ export async function getAccuracySnapshotStorage(): Promise { - const latestRunId = await this.getLastRunIdForCommit(commit); + const latestRunId = await this.getLatestAccuracyRunForCommit(commit); return latestRunId ? this.getSnapshotEntriesForRunId(latestRunId) : []; } - private async getLastRunIdForCommit(commit: string): Promise { + private async getLatestAccuracyRunForCommit(commit: string): Promise { const document = await this.snapshotCollection.findOne( - { commit: commit }, + { commit: commit, accuracyRunStatus: AccuracyRunStatus.Done }, { sort: { createdOn: -1 }, projection: { accuracyRunId: 1 } } ); @@ -83,12 +83,16 @@ export class MongoDBSnapshotStorage implements AccuracySnapshotStorage { ); } - static getStorage(commitSHA: string, accuracyRunId: string): MongoDBSnapshotStorage { + async close(): Promise { + await this.client.close(); + } + + static getStorage(commitSHA: string, accuracyRunId: string): MongoDBSnapshotStorage | null { const mongodbUrl = process.env.MDB_ACCURACY_MDB_URL; const database = process.env.MDB_ACCURACY_MDB_DB; const collection = process.env.MDB_ACCURACY_MDB_COLLECTION; if (!mongodbUrl || !database || !collection) { - throw new Error("Cannot create MongoDBAccuracySnapshot storage without relevant configuration provided"); + return null; } return new 
MongoDBSnapshotStorage({ @@ -99,8 +103,4 @@ export class MongoDBSnapshotStorage implements AccuracySnapshotStorage { accuracyRunId, }); } - - async close(): Promise { - await this.client.close(); - } } From 5c99f85b73b59e5cac17858de0af5a678c3d08e7 Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Tue, 8 Jul 2025 23:40:07 +0200 Subject: [PATCH 34/91] chore: revert changes done to any of the src files --- src/tools/mongodb/create/insertMany.ts | 28 +++---- src/tools/mongodb/delete/deleteMany.ts | 20 ++--- .../mongodb/metadata/collectionSchema.ts | 59 +++++++-------- src/tools/mongodb/metadata/listCollections.ts | 45 +++++------ src/tools/mongodb/metadata/listDatabases.ts | 23 ++---- src/tools/mongodb/read/collectionIndexes.ts | 74 ++++++------------- src/tools/mongodb/read/find.ts | 34 ++++----- 7 files changed, 110 insertions(+), 173 deletions(-) diff --git a/src/tools/mongodb/create/insertMany.ts b/src/tools/mongodb/create/insertMany.ts index e8937825..4744e344 100644 --- a/src/tools/mongodb/create/insertMany.ts +++ b/src/tools/mongodb/create/insertMany.ts @@ -3,21 +3,6 @@ import { CallToolResult } from "@modelcontextprotocol/sdk/types.js"; import { DbOperationArgs, MongoDBToolBase } from "../mongodbTool.js"; import { ToolArgs, OperationType } from "../../tool.js"; -export function insertManyResponse(collection: string, insertedCount: number, insertedIds: unknown[]): CallToolResult { - return { - content: [ - { - text: `Inserted \`${insertedCount}\` document(s) into collection "${collection}"`, - type: "text", - }, - { - text: `Inserted IDs: ${insertedIds.join(", ")}`, - type: "text", - }, - ], - }; -} - export class InsertManyTool extends MongoDBToolBase { public name = "insert-many"; protected description = "Insert an array of documents into a MongoDB collection"; @@ -39,6 +24,17 @@ export class InsertManyTool extends MongoDBToolBase { const provider = await this.ensureConnected(); const result = await provider.insertMany(database, collection, documents); - 
return insertManyResponse(collection, result.insertedCount, Object.values(result.insertedIds)); + return { + content: [ + { + text: `Inserted \`${result.insertedCount}\` document(s) into collection "${collection}"`, + type: "text", + }, + { + text: `Inserted IDs: ${Object.values(result.insertedIds).join(", ")}`, + type: "text", + }, + ], + }; } } diff --git a/src/tools/mongodb/delete/deleteMany.ts b/src/tools/mongodb/delete/deleteMany.ts index df02094b..aa135512 100644 --- a/src/tools/mongodb/delete/deleteMany.ts +++ b/src/tools/mongodb/delete/deleteMany.ts @@ -4,17 +4,6 @@ import { DbOperationArgs, MongoDBToolBase } from "../mongodbTool.js"; import { ToolArgs, OperationType } from "../../tool.js"; import { checkIndexUsage } from "../../../helpers/indexCheck.js"; -export function deleteManyResponse(collection: string, delectedCount: number): CallToolResult { - return { - content: [ - { - text: `Deleted \`${delectedCount}\` document(s) from collection "${collection}"`, - type: "text", - }, - ], - }; -} - export class DeleteManyTool extends MongoDBToolBase { public name = "delete-many"; protected description = "Removes all documents that match the filter from a MongoDB collection"; @@ -56,6 +45,13 @@ export class DeleteManyTool extends MongoDBToolBase { const result = await provider.deleteMany(database, collection, filter); - return deleteManyResponse(collection, result.deletedCount); + return { + content: [ + { + text: `Deleted \`${result.deletedCount}\` document(s) from collection "${collection}"`, + type: "text", + }, + ], + }; } } diff --git a/src/tools/mongodb/metadata/collectionSchema.ts b/src/tools/mongodb/metadata/collectionSchema.ts index 2f419acb..693b8f91 100644 --- a/src/tools/mongodb/metadata/collectionSchema.ts +++ b/src/tools/mongodb/metadata/collectionSchema.ts @@ -1,38 +1,7 @@ import { CallToolResult } from "@modelcontextprotocol/sdk/types.js"; import { DbOperationArgs, MongoDBToolBase } from "../mongodbTool.js"; import { ToolArgs, OperationType } 
from "../../tool.js"; -import { getSimplifiedSchema, SimplifiedSchema } from "mongodb-schema"; - -export function collectionSchemaResponse( - database: string, - collection: string, - schema: SimplifiedSchema -): CallToolResult { - const fieldsCount = Object.entries(schema).length; - if (fieldsCount === 0) { - return { - content: [ - { - text: `Could not deduce the schema for "${database}.${collection}". This may be because it doesn't exist or is empty.`, - type: "text", - }, - ], - }; - } - - return { - content: [ - { - text: `Found ${fieldsCount} fields in the schema for "${database}.${collection}"`, - type: "text", - }, - { - text: JSON.stringify(schema), - type: "text", - }, - ], - }; -} +import { getSimplifiedSchema } from "mongodb-schema"; export class CollectionSchemaTool extends MongoDBToolBase { public name = "collection-schema"; @@ -45,6 +14,30 @@ export class CollectionSchemaTool extends MongoDBToolBase { const provider = await this.ensureConnected(); const documents = await provider.find(database, collection, {}, { limit: 5 }).toArray(); const schema = await getSimplifiedSchema(documents); - return collectionSchemaResponse(database, collection, schema); + + const fieldsCount = Object.entries(schema).length; + if (fieldsCount === 0) { + return { + content: [ + { + text: `Could not deduce the schema for "${database}.${collection}". 
This may be because it doesn't exist or is empty.`, + type: "text", + }, + ], + }; + } + + return { + content: [ + { + text: `Found ${fieldsCount} fields in the schema for "${database}.${collection}"`, + type: "text", + }, + { + text: JSON.stringify(schema), + type: "text", + }, + ], + }; } } diff --git a/src/tools/mongodb/metadata/listCollections.ts b/src/tools/mongodb/metadata/listCollections.ts index 5aad19ab..9611d541 100644 --- a/src/tools/mongodb/metadata/listCollections.ts +++ b/src/tools/mongodb/metadata/listCollections.ts @@ -2,28 +2,6 @@ import { CallToolResult } from "@modelcontextprotocol/sdk/types.js"; import { DbOperationArgs, MongoDBToolBase } from "../mongodbTool.js"; import { ToolArgs, OperationType } from "../../tool.js"; -export function listCollectionsResponse(database: string, collections: string[]): CallToolResult { - if (collections.length === 0) { - return { - content: [ - { - type: "text", - text: `No collections found for database "${database}". To create a collection, use the "create-collection" tool.`, - }, - ], - }; - } - - return { - content: collections.map((collection) => { - return { - text: `Name: "${collection}"`, - type: "text", - }; - }), - }; -} - export class ListCollectionsTool extends MongoDBToolBase { public name = "list-collections"; protected description = "List all collections for a given database"; @@ -37,9 +15,24 @@ export class ListCollectionsTool extends MongoDBToolBase { const provider = await this.ensureConnected(); const collections = await provider.listCollections(database); - return listCollectionsResponse( - database, - collections.map((collection) => `${collection.name}`) - ); + if (collections.length === 0) { + return { + content: [ + { + type: "text", + text: `No collections found for database "${database}". 
To create a collection, use the "create-collection" tool.`, + }, + ], + }; + } + + return { + content: collections.map((collection) => { + return { + text: `Name: "${collection.name}"`, + type: "text", + }; + }), + }; } } diff --git a/src/tools/mongodb/metadata/listDatabases.ts b/src/tools/mongodb/metadata/listDatabases.ts index c1022c5b..400f275b 100644 --- a/src/tools/mongodb/metadata/listDatabases.ts +++ b/src/tools/mongodb/metadata/listDatabases.ts @@ -3,17 +3,6 @@ import { MongoDBToolBase } from "../mongodbTool.js"; import * as bson from "bson"; import { OperationType } from "../../tool.js"; -export function listDatabasesResponse(databases: { name: string; sizeOnDisk: string }[]): CallToolResult { - return { - content: databases.map((db) => { - return { - text: `Name: ${db.name}, Size: ${db.sizeOnDisk} bytes`, - type: "text", - }; - }), - }; -} - export class ListDatabasesTool extends MongoDBToolBase { public name = "list-databases"; protected description = "List all databases for a MongoDB connection"; @@ -24,13 +13,13 @@ export class ListDatabasesTool extends MongoDBToolBase { const provider = await this.ensureConnected(); const dbs = (await provider.listDatabases("")).databases as { name: string; sizeOnDisk: bson.Long }[]; - return listDatabasesResponse( - dbs.map((db) => { + return { + content: dbs.map((db) => { return { - name: db.name, - sizeOnDisk: db.sizeOnDisk.toString(), + text: `Name: ${db.name}, Size: ${db.sizeOnDisk.toString()} bytes`, + type: "text", }; - }) - ); + }), + }; } } diff --git a/src/tools/mongodb/read/collectionIndexes.ts b/src/tools/mongodb/read/collectionIndexes.ts index 7d541128..ef3fa75d 100644 --- a/src/tools/mongodb/read/collectionIndexes.ts +++ b/src/tools/mongodb/read/collectionIndexes.ts @@ -2,44 +2,6 @@ import { CallToolResult } from "@modelcontextprotocol/sdk/types.js"; import { DbOperationArgs, MongoDBToolBase } from "../mongodbTool.js"; import { ToolArgs, OperationType } from "../../tool.js"; -export function 
collectionIndexesResponse({ - database, - collection, - indexes = [], - namespaceNotFound, -}: { - database: string; - collection: string; - indexes?: { name: string; key: string }[]; - namespaceNotFound?: boolean; -}): CallToolResult { - if (namespaceNotFound) { - return { - content: [ - { - text: `The indexes for "${database}.${collection}" cannot be determined because the collection does not exist.`, - type: "text", - }, - ], - }; - } - - return { - content: [ - { - text: `Found ${indexes.length} indexes in the collection "${collection}":`, - type: "text", - }, - ...(indexes.map((indexDefinition) => { - return { - text: `Name "${indexDefinition.name}", definition: ${JSON.stringify(indexDefinition.key)}`, - type: "text", - }; - }) as { text: string; type: "text" }[]), - ], - }; -} - export class CollectionIndexesTool extends MongoDBToolBase { public name = "collection-indexes"; protected description = "Describe the indexes for a collection"; @@ -49,14 +11,21 @@ export class CollectionIndexesTool extends MongoDBToolBase { protected async execute({ database, collection }: ToolArgs): Promise { const provider = await this.ensureConnected(); const indexes = await provider.getIndexes(database, collection); - return collectionIndexesResponse({ - database, - collection, - indexes: indexes.map((index) => ({ - name: `${index.name}`, - key: JSON.stringify(index.key), - })), - }); + + return { + content: [ + { + text: `Found ${indexes.length} indexes in the collection "${collection}":`, + type: "text", + }, + ...(indexes.map((indexDefinition) => { + return { + text: `Name "${indexDefinition.name}", definition: ${JSON.stringify(indexDefinition.key)}`, + type: "text", + }; + }) as { text: string; type: "text" }[]), + ], + }; } protected handleError( @@ -64,11 +33,14 @@ export class CollectionIndexesTool extends MongoDBToolBase { args: ToolArgs ): Promise | CallToolResult { if (error instanceof Error && "codeName" in error && error.codeName === "NamespaceNotFound") { - return 
collectionIndexesResponse({ - database: args.database, - collection: args.collection, - namespaceNotFound: true, - }); + return { + content: [ + { + text: `The indexes for "${args.database}.${args.collection}" cannot be determined because the collection does not exist.`, + type: "text", + }, + ], + }; } return super.handleError(error, args); diff --git a/src/tools/mongodb/read/find.ts b/src/tools/mongodb/read/find.ts index e8a40799..02c337ed 100644 --- a/src/tools/mongodb/read/find.ts +++ b/src/tools/mongodb/read/find.ts @@ -22,23 +22,6 @@ export const FindArgs = { .describe("A document, describing the sort order, matching the syntax of the sort argument of cursor.sort()"), }; -export function findResponse(collection: string, documents: unknown[]): CallToolResult { - return { - content: [ - { - text: `Found ${documents.length} documents in the collection "${collection}":`, - type: "text", - }, - ...documents.map<{ type: "text"; text: string }>((doc) => { - return { - text: EJSON.stringify(doc), - type: "text", - }; - }), - ], - }; -} - export class FindTool extends MongoDBToolBase { public name = "find"; protected description = "Run a find query against a MongoDB collection"; @@ -67,6 +50,21 @@ export class FindTool extends MongoDBToolBase { const documents = await provider.find(database, collection, filter, { projection, limit, sort }).toArray(); - return findResponse(collection, documents); + const content: Array<{ text: string; type: "text" }> = [ + { + text: `Found ${documents.length} documents in the collection "${collection}":`, + type: "text", + }, + ...documents.map((doc) => { + return { + text: EJSON.stringify(doc), + type: "text", + } as { text: string; type: "text" }; + }), + ]; + + return { + content, + }; } } From 0d6938aedc05d79f4bf18545bfe7de31a6855218 Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Wed, 9 Jul 2025 00:02:41 +0200 Subject: [PATCH 35/91] chore: handle test failures and appropriately mark them as failed in accuracyRunStatus --- 
scripts/mark-accuracy-run-finished.ts | 20 +++++++++++++--- scripts/run-accuracy-tests.sh | 7 ++++-- tests/accuracy/collection-schema.test.ts | 24 +------------------ .../disk-snapshot-storage.ts | 5 ++-- .../mdb-snapshot-storage.ts | 5 ++-- .../snapshot-storage.ts | 7 ++++-- 6 files changed, 34 insertions(+), 34 deletions(-) diff --git a/scripts/mark-accuracy-run-finished.ts b/scripts/mark-accuracy-run-finished.ts index ad3e3530..8c1a397c 100644 --- a/scripts/mark-accuracy-run-finished.ts +++ b/scripts/mark-accuracy-run-finished.ts @@ -1,7 +1,21 @@ import { getAccuracySnapshotStorage } from "../tests/accuracy/sdk/accuracy-snapshot-storage/get-snapshot-storage.js"; +import { + AccuracyRunStatus, + AccuracyRunStatuses, +} from "../tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.js"; -console.time(`Marked accuracy run id - ${process.env.MDB_ACCURACY_RUN_ID} as finished in`); +let status: AccuracyRunStatuses | undefined; +if (process.env.MDB_ACCURACY_RUN_STATUS === "done") { + status = AccuracyRunStatus.Done; +} else if (process.env.MDB_ACCURACY_RUN_STATUS === "failed") { + status = AccuracyRunStatus.Failed; +} else { + console.info(`Unknown status - ${process.env.MDB_ACCURACY_RUN_STATUS}, will not update accuracy run.`); + process.exit(1); +} + +console.time(`Marked accuracy run id - ${process.env.MDB_ACCURACY_RUN_ID} as ${status} in`); const storage = await getAccuracySnapshotStorage(); -await storage.accuracyRunFinished(); +await storage.updateAccuracyRunStatus(status); await storage.close(); -console.timeEnd(`Marked accuracy run id - ${process.env.MDB_ACCURACY_RUN_ID} as finished in`); +console.timeEnd(`Marked accuracy run id - ${process.env.MDB_ACCURACY_RUN_ID} as ${status} in`); diff --git a/scripts/run-accuracy-tests.sh b/scripts/run-accuracy-tests.sh index 20a16591..38d11a99 100644 --- a/scripts/run-accuracy-tests.sh +++ b/scripts/run-accuracy-tests.sh @@ -22,7 +22,10 @@ node --experimental-vm-modules node_modules/jest/bin/jest.js 
--testPathPattern " # Each test run submits an accuracy snapshot entry for each prompt with the # accuracyRunStatus: "in-progress". When all the tests are done and jest exits -# with an exit code of 0, we can safely mark accuracy run as finished. +# with an exit code of 0, we can safely mark accuracy run as finished otherwise +# failed. if [ $? -eq 0 ]; then - npx tsx scripts/mark-accuracy-run-finished.ts + MDB_ACCURACY_RUN_STATUS="done" npx tsx scripts/mark-accuracy-run-finished.ts +else + MDB_ACCURACY_RUN_STATUS="failed" npx tsx scripts/mark-accuracy-run-finished.ts fi \ No newline at end of file diff --git a/tests/accuracy/collection-schema.test.ts b/tests/accuracy/collection-schema.test.ts index f81273ea..2866e709 100644 --- a/tests/accuracy/collection-schema.test.ts +++ b/tests/accuracy/collection-schema.test.ts @@ -1,34 +1,12 @@ import { describeAccuracyTests, describeSuite } from "./sdk/describe-accuracy-tests.js"; import { getAvailableModels } from "./sdk/models.js"; import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; -import { collectionSchemaResponse } from "../../src/tools/mongodb/metadata/collectionSchema.js"; -import { getSimplifiedSchema } from "mongodb-schema"; function callsCollectionSchema(prompt: string): AccuracyTestConfig { return { injectConnectedAssumption: true, prompt: prompt, - mockedTools: { - "collection-schema": async function collectionSchema() { - return collectionSchemaResponse( - "db1", - "coll1", - await getSimplifiedSchema([ - { - name: "Sample name1", - dob: "28.11.2001", - location: "NY", - }, - { - name: "Sample name1", - dob: "28.11.2001", - location: "NY", - title: "Dr.", - }, - ]) - ); - }, - }, + mockedTools: {}, expectedToolCalls: [ { toolName: "collection-schema", diff --git a/tests/accuracy/sdk/accuracy-snapshot-storage/disk-snapshot-storage.ts b/tests/accuracy/sdk/accuracy-snapshot-storage/disk-snapshot-storage.ts index 668e130a..58bc396b 100644 --- 
a/tests/accuracy/sdk/accuracy-snapshot-storage/disk-snapshot-storage.ts +++ b/tests/accuracy/sdk/accuracy-snapshot-storage/disk-snapshot-storage.ts @@ -3,6 +3,7 @@ import fs from "fs/promises"; import { fileURLToPath } from "url"; import { AccuracyRunStatus, + AccuracyRunStatuses, AccuracySnapshotEntry, AccuracySnapshotEntrySchema, AccuracySnapshotStorage, @@ -57,13 +58,13 @@ export class DiskSnapshotStorage implements AccuracySnapshotStorage { return latestRunId ? snapshot.filter((entry) => entry.accuracyRunId === latestRunId) : []; } - async accuracyRunFinished(): Promise { + async updateAccuracyRunStatus(status: AccuracyRunStatuses) { const snapshot = await this.readSnapshot(); const updatedSnapshot = snapshot.map((entry) => { if (entry.accuracyRunId === this.accuracyRunId) { return { ...entry, - accuracyRunStatus: AccuracyRunStatus.Done, + accuracyRunStatus: status, }; } diff --git a/tests/accuracy/sdk/accuracy-snapshot-storage/mdb-snapshot-storage.ts b/tests/accuracy/sdk/accuracy-snapshot-storage/mdb-snapshot-storage.ts index bb506ab2..193ba9f9 100644 --- a/tests/accuracy/sdk/accuracy-snapshot-storage/mdb-snapshot-storage.ts +++ b/tests/accuracy/sdk/accuracy-snapshot-storage/mdb-snapshot-storage.ts @@ -1,6 +1,7 @@ import { Collection, MongoClient } from "mongodb"; import { AccuracyRunStatus, + AccuracyRunStatuses, AccuracySnapshotEntry, AccuracySnapshotEntrySchema, AccuracySnapshotStorage, @@ -76,10 +77,10 @@ export class MongoDBSnapshotStorage implements AccuracySnapshotStorage { return AccuracySnapshotEntrySchema.array().parse(snapshotEntries); } - async accuracyRunFinished(): Promise { + async updateAccuracyRunStatus(status: AccuracyRunStatuses) { await this.snapshotCollection.updateMany( { accuracyRunId: this.accuracyRunId }, - { $set: { accuracyRunStatus: AccuracyRunStatus.Done } } + { $set: { accuracyRunStatus: status } } ); } diff --git a/tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.ts 
b/tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.ts index f77c4d79..4daf1476 100644 --- a/tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.ts +++ b/tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.ts @@ -11,14 +11,17 @@ export type ActualToolCall = z.infer; export const AccuracyRunStatus = { Done: "done", + Failed: "failed", InProgress: "in-progress", } as const; +export type AccuracyRunStatuses = (typeof AccuracyRunStatus)[keyof typeof AccuracyRunStatus]; + export const AccuracySnapshotEntrySchema = z.object({ // Git and meta information for snapshot entries accuracyRunId: z.string(), accuracyRunStatus: z - .enum([AccuracyRunStatus.Done, AccuracyRunStatus.InProgress]) + .enum([AccuracyRunStatus.Done, AccuracyRunStatus.Failed, AccuracyRunStatus.InProgress]) .default(AccuracyRunStatus.InProgress), createdOn: z.number(), commitSHA: z.string(), @@ -67,7 +70,7 @@ export interface AccuracySnapshotStorage { getLatestSnapshotsForCommit(commit: string): Promise; - accuracyRunFinished(): Promise; + updateAccuracyRunStatus(status: AccuracyRunStatuses): Promise; close(): Promise; } From cbb137adc3e6476b08e9ad9af26da5c8aacc816d Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Wed, 9 Jul 2025 17:55:05 +0200 Subject: [PATCH 36/91] chore: make snapshot storage independent of accuracyRunId and commitSHA --- scripts/mark-accuracy-run-finished.ts | 21 ------------ scripts/run-accuracy-tests.sh | 12 ++++--- scripts/update-accuracy-run-status.ts | 22 ++++++++++++ .../disk-snapshot-storage.ts | 24 ++++++------- .../get-snapshot-storage.ts | 5 +-- .../mdb-snapshot-storage.ts | 34 +++++++------------ .../snapshot-storage.ts | 8 +++-- tests/accuracy/sdk/describe-accuracy-tests.ts | 17 +++++++++- 8 files changed, 77 insertions(+), 66 deletions(-) delete mode 100644 scripts/mark-accuracy-run-finished.ts create mode 100644 scripts/update-accuracy-run-status.ts diff --git a/scripts/mark-accuracy-run-finished.ts 
b/scripts/mark-accuracy-run-finished.ts deleted file mode 100644 index 8c1a397c..00000000 --- a/scripts/mark-accuracy-run-finished.ts +++ /dev/null @@ -1,21 +0,0 @@ -import { getAccuracySnapshotStorage } from "../tests/accuracy/sdk/accuracy-snapshot-storage/get-snapshot-storage.js"; -import { - AccuracyRunStatus, - AccuracyRunStatuses, -} from "../tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.js"; - -let status: AccuracyRunStatuses | undefined; -if (process.env.MDB_ACCURACY_RUN_STATUS === "done") { - status = AccuracyRunStatus.Done; -} else if (process.env.MDB_ACCURACY_RUN_STATUS === "failed") { - status = AccuracyRunStatus.Failed; -} else { - console.info(`Unknown status - ${process.env.MDB_ACCURACY_RUN_STATUS}, will not update accuracy run.`); - process.exit(1); -} - -console.time(`Marked accuracy run id - ${process.env.MDB_ACCURACY_RUN_ID} as ${status} in`); -const storage = await getAccuracySnapshotStorage(); -await storage.updateAccuracyRunStatus(status); -await storage.close(); -console.timeEnd(`Marked accuracy run id - ${process.env.MDB_ACCURACY_RUN_ID} as ${status} in`); diff --git a/scripts/run-accuracy-tests.sh b/scripts/run-accuracy-tests.sh index 38d11a99..a9a255f2 100644 --- a/scripts/run-accuracy-tests.sh +++ b/scripts/run-accuracy-tests.sh @@ -24,8 +24,12 @@ node --experimental-vm-modules node_modules/jest/bin/jest.js --testPathPattern " # accuracyRunStatus: "in-progress". When all the tests are done and jest exits # with an exit code of 0, we can safely mark accuracy run as finished otherwise # failed. -if [ $? -eq 0 ]; then - MDB_ACCURACY_RUN_STATUS="done" npx tsx scripts/mark-accuracy-run-finished.ts +JEST_EXIT_CODE=$? 
+if [ $JEST_EXIT_CODE -eq 0 ]; then + MDB_ACCURACY_RUN_STATUS="done" npx tsx scripts/update-accuracy-run-status.ts || echo "Warning: Failed to update accuracy run status to 'done'" else - MDB_ACCURACY_RUN_STATUS="failed" npx tsx scripts/mark-accuracy-run-finished.ts -fi \ No newline at end of file + MDB_ACCURACY_RUN_STATUS="failed" npx tsx scripts/update-accuracy-run-status.ts || echo "Warning: Failed to update accuracy run status to 'failed'" +fi + +# Preserve the original Jest exit code for CI +exit $JEST_EXIT_CODE \ No newline at end of file diff --git a/scripts/update-accuracy-run-status.ts b/scripts/update-accuracy-run-status.ts new file mode 100644 index 00000000..6d1a8bb8 --- /dev/null +++ b/scripts/update-accuracy-run-status.ts @@ -0,0 +1,22 @@ +import { getAccuracySnapshotStorage } from "../tests/accuracy/sdk/accuracy-snapshot-storage/get-snapshot-storage.js"; +import { + AccuracyRunStatus, + AccuracyRunStatuses, +} from "../tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.js"; + +const envAccuracyRunId = process.env.MDB_ACCURACY_RUN_ID; +const envAccuracyRunStatus = process.env.MDB_ACCURACY_RUN_STATUS; + +let status: AccuracyRunStatuses | undefined; +if ( + !envAccuracyRunId || + (envAccuracyRunStatus !== AccuracyRunStatus.Done && envAccuracyRunStatus !== AccuracyRunStatus.Failed) +) { + process.exit(1); +} + +console.time(`Marked accuracy run id - ${envAccuracyRunId} as ${status} in`); +const storage = await getAccuracySnapshotStorage(); +await storage.updateAccuracyRunStatus(envAccuracyRunId, envAccuracyRunStatus); +await storage.close(); +console.timeEnd(`Marked accuracy run id - ${envAccuracyRunId} as ${status} in`); diff --git a/tests/accuracy/sdk/accuracy-snapshot-storage/disk-snapshot-storage.ts b/tests/accuracy/sdk/accuracy-snapshot-storage/disk-snapshot-storage.ts index 58bc396b..a4d2bea0 100644 --- a/tests/accuracy/sdk/accuracy-snapshot-storage/disk-snapshot-storage.ts +++ 
b/tests/accuracy/sdk/accuracy-snapshot-storage/disk-snapshot-storage.ts @@ -14,14 +14,11 @@ const snapshotsDir = path.resolve(rootDir, ".accuracy-snapshots"); export const snapshotFilePath = path.resolve(snapshotsDir, "snapshots.json"); export class DiskSnapshotStorage implements AccuracySnapshotStorage { - private constructor( - private readonly accuracyRunId: string, - private readonly commitSHA: string - ) {} - async createSnapshotEntry( snapshotEntry: Pick< AccuracySnapshotEntry, + | "accuracyRunId" + | "commitSHA" | "provider" | "requestedModel" | "test" @@ -38,8 +35,6 @@ export class DiskSnapshotStorage implements AccuracySnapshotStorage { ): Promise { const snapshotWithMeta: AccuracySnapshotEntry = { ...snapshotEntry, - commitSHA: this.commitSHA, - accuracyRunId: this.accuracyRunId, accuracyRunStatus: AccuracyRunStatus.InProgress, createdOn: Date.now(), }; @@ -47,7 +42,7 @@ export class DiskSnapshotStorage implements AccuracySnapshotStorage { await this.appendAccuracySnapshot(snapshotWithMeta); } - async getLatestSnapshotsForCommit(commit: string): Promise { + async getLatestSnapshotForCommit(commit: string): Promise { const snapshot = await this.readSnapshot(); const entries = snapshot .filter((entry) => { @@ -58,10 +53,15 @@ export class DiskSnapshotStorage implements AccuracySnapshotStorage { return latestRunId ? 
snapshot.filter((entry) => entry.accuracyRunId === latestRunId) : []; } - async updateAccuracyRunStatus(status: AccuracyRunStatuses) { + async getSnapshotForAccuracyRun(accuracyRunId: string): Promise { + const snapshot = await this.readSnapshot(); + return snapshot.filter((entry) => entry.accuracyRunId === accuracyRunId); + } + + async updateAccuracyRunStatus(accuracyRunId: string, status: AccuracyRunStatuses) { const snapshot = await this.readSnapshot(); const updatedSnapshot = snapshot.map((entry) => { - if (entry.accuracyRunId === this.accuracyRunId) { + if (entry.accuracyRunId === accuracyRunId) { return { ...entry, accuracyRunStatus: status, @@ -116,8 +116,8 @@ export class DiskSnapshotStorage implements AccuracySnapshotStorage { return new Promise((resolve) => setTimeout(resolve, ms)); } - static async getStorage(commitSHA: string, accuracyRunId: string) { + static async getStorage() { await fs.mkdir(snapshotsDir, { recursive: true }); - return new DiskSnapshotStorage(commitSHA, accuracyRunId); + return new DiskSnapshotStorage(); } } diff --git a/tests/accuracy/sdk/accuracy-snapshot-storage/get-snapshot-storage.ts b/tests/accuracy/sdk/accuracy-snapshot-storage/get-snapshot-storage.ts index 020afc79..3bec4c53 100644 --- a/tests/accuracy/sdk/accuracy-snapshot-storage/get-snapshot-storage.ts +++ b/tests/accuracy/sdk/accuracy-snapshot-storage/get-snapshot-storage.ts @@ -16,8 +16,5 @@ export async function getAccuracySnapshotStorage(): Promise { const snapshotWithMeta: AccuracySnapshotEntry = { ...snapshotEntry, - commitSHA: this.commitSHA, - accuracyRunId: this.accuracyRunId, accuracyRunStatus: AccuracyRunStatus.InProgress, createdOn: Date.now(), }; await this.snapshotCollection.insertOne(snapshotWithMeta); } - async getLatestSnapshotsForCommit(commit: string): Promise { + async getLatestSnapshotForCommit(commit: string): Promise { const latestRunId = await this.getLatestAccuracyRunForCommit(commit); - return latestRunId ? 
this.getSnapshotEntriesForRunId(latestRunId) : []; + return latestRunId ? this.getSnapshotForAccuracyRun(latestRunId) : []; + } + + async getSnapshotForAccuracyRun(accuracyRunId: string): Promise { + const snapshotEntries = await this.snapshotCollection.find({ accuracyRunId }).toArray(); + return AccuracySnapshotEntrySchema.array().parse(snapshotEntries); } private async getLatestAccuracyRunForCommit(commit: string): Promise { @@ -72,14 +69,9 @@ export class MongoDBSnapshotStorage implements AccuracySnapshotStorage { return document?.accuracyRunId ? `${document?.accuracyRunId}` : undefined; } - private async getSnapshotEntriesForRunId(accuracyRunId: string): Promise { - const snapshotEntries = await this.snapshotCollection.find({ accuracyRunId }).toArray(); - return AccuracySnapshotEntrySchema.array().parse(snapshotEntries); - } - - async updateAccuracyRunStatus(status: AccuracyRunStatuses) { + async updateAccuracyRunStatus(accuracyRunId: string, status: AccuracyRunStatuses) { await this.snapshotCollection.updateMany( - { accuracyRunId: this.accuracyRunId }, + { accuracyRunId: accuracyRunId }, { $set: { accuracyRunStatus: status } } ); } @@ -88,7 +80,7 @@ export class MongoDBSnapshotStorage implements AccuracySnapshotStorage { await this.client.close(); } - static getStorage(commitSHA: string, accuracyRunId: string): MongoDBSnapshotStorage | null { + static getStorage(): MongoDBSnapshotStorage | null { const mongodbUrl = process.env.MDB_ACCURACY_MDB_URL; const database = process.env.MDB_ACCURACY_MDB_DB; const collection = process.env.MDB_ACCURACY_MDB_COLLECTION; @@ -100,8 +92,6 @@ export class MongoDBSnapshotStorage implements AccuracySnapshotStorage { mongodbUrl, database, collection, - commitSHA, - accuracyRunId, }); } } diff --git a/tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.ts b/tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.ts index 4daf1476..e7833456 100644 --- 
a/tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.ts +++ b/tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.ts @@ -53,6 +53,8 @@ export interface AccuracySnapshotStorage { createSnapshotEntry( snapshotEntry: Pick< AccuracySnapshotEntry, + | "accuracyRunId" + | "commitSHA" | "provider" | "requestedModel" | "test" @@ -68,9 +70,11 @@ export interface AccuracySnapshotStorage { > ): Promise; - getLatestSnapshotsForCommit(commit: string): Promise; + getLatestSnapshotForCommit(commit: string): Promise; - updateAccuracyRunStatus(status: AccuracyRunStatuses): Promise; + getSnapshotForAccuracyRun(accuracyRunId: string): Promise; + + updateAccuracyRunStatus(accuracyRunId: string, status: AccuracyRunStatuses): Promise; close(): Promise; } diff --git a/tests/accuracy/sdk/describe-accuracy-tests.ts b/tests/accuracy/sdk/describe-accuracy-tests.ts index f472c7f2..1dd6d971 100644 --- a/tests/accuracy/sdk/describe-accuracy-tests.ts +++ b/tests/accuracy/sdk/describe-accuracy-tests.ts @@ -5,6 +5,7 @@ import { prepareTestData, setupMongoDBIntegrationTest } from "../../integration/ import { AccuracyTestingClient, MockedTools } from "./accuracy-testing-client.js"; import { getAccuracySnapshotStorage } from "./accuracy-snapshot-storage/get-snapshot-storage.js"; import { AccuracySnapshotStorage, ExpectedToolCall } from "./accuracy-snapshot-storage/snapshot-storage.js"; +import { getCommitSHA } from "./git-info.js"; export interface AccuracyTestConfig { systemPrompt?: string; @@ -26,8 +27,12 @@ export function describeAccuracyTests( [suiteName: string]: AccuracyTestConfig[]; } ) { + if (!process.env.MDB_ACCURACY_RUN_ID) { + throw new Error("MDB_ACCURACY_RUN_ID env variable is required for accuracy test runs!"); + } + if (!models.length) { - throw new Error("No models available to test!"); + throw new Error("No models available to test. 
Ensure that the API keys are properly setup!"); } const eachModel = describe.each(models); @@ -37,11 +42,19 @@ export function describeAccuracyTests( const mdbIntegration = setupMongoDBIntegrationTest(); const { populateTestData, cleanupTestDatabases } = prepareTestData(mdbIntegration); + const accuracyRunId: string = `${process.env.MDB_ACCURACY_RUN_ID}`; + let commitSHA: string; let accuracySnapshotStorage: AccuracySnapshotStorage; let testMCPClient: AccuracyTestingClient; let agent: VercelAgent; beforeAll(async () => { + const retrievedCommitSHA = await getCommitSHA(); + if (!retrievedCommitSHA) { + throw new Error("Could not derive commitSHA, exiting accuracy tests!"); + } + + commitSHA = retrievedCommitSHA; accuracySnapshotStorage = await getAccuracySnapshotStorage(); testMCPClient = await AccuracyTestingClient.initializeClient(mdbIntegration.connectionString()); agent = getVercelToolCallingAgent(); @@ -76,6 +89,8 @@ export function describeAccuracyTests( const responseTime = timeAfterPrompt - timeBeforePrompt; await accuracySnapshotStorage.createSnapshotEntry({ + accuracyRunId, + commitSHA, provider: model.provider, requestedModel: model.modelName, test: suiteName, From 9321563b786636e5504c84c177e0ce8e2f1718b0 Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Wed, 9 Jul 2025 18:18:53 +0200 Subject: [PATCH 37/91] chore: bail on first failure and add some explanation for update-accuracy-status script --- scripts/run-accuracy-tests.sh | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/scripts/run-accuracy-tests.sh b/scripts/run-accuracy-tests.sh index a9a255f2..e009661f 100644 --- a/scripts/run-accuracy-tests.sh +++ b/scripts/run-accuracy-tests.sh @@ -18,18 +18,30 @@ export MDB_ACCURACY_RUN_ID=$(npx uuid v4) # npm run test:accuracy -- tests/accuracy/some-test.test.ts TEST_PATH_PATTERN="${1:-tests/accuracy}" shift || true -node --experimental-vm-modules node_modules/jest/bin/jest.js --testPathPattern "$TEST_PATH_PATTERN" 
"$@" +node --experimental-vm-modules node_modules/jest/bin/jest.js --bail --testPathPattern "$TEST_PATH_PATTERN" "$@" -# Each test run submits an accuracy snapshot entry for each prompt with the -# accuracyRunStatus: "in-progress". When all the tests are done and jest exits -# with an exit code of 0, we can safely mark accuracy run as finished otherwise -# failed. +# Preserving the exit code from test run to correctly notify in the CI +# environments when the tests fail. JEST_EXIT_CODE=$? + +# Each test run submits an accuracy snapshot entry with the accuracyRunStatus: +# "in-progress". When all the tests are done and jest exits with an exit code of +# 0, we can safely mark accuracy run as finished otherwise failed. + +# This "outside-the-tests-status-update" is arising out of the fact that each +# test suite stores their own accuracy run data in the storage and this setup +# might lead to data inconsistency when the tests fail. To overcome that each +# accuracy snapshot entry has a status which by default is "in-progress" and is +# updated when the tests either pass (all our accuracy tests are supposed to +# pass unless some errors occurs during the test runs), or fail. + +# This is necessary when comparing one accuracy run with another as we wouldn't +# want to compare against an incomplete run. if [ $JEST_EXIT_CODE -eq 0 ]; then MDB_ACCURACY_RUN_STATUS="done" npx tsx scripts/update-accuracy-run-status.ts || echo "Warning: Failed to update accuracy run status to 'done'" else MDB_ACCURACY_RUN_STATUS="failed" npx tsx scripts/update-accuracy-run-status.ts || echo "Warning: Failed to update accuracy run status to 'failed'" fi -# Preserve the original Jest exit code for CI + exit $JEST_EXIT_CODE \ No newline at end of file From f636c3fb12f494e62a60f9d78749af52ceae5f7b Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Thu, 10 Jul 2025 00:17:08 +0200 Subject: [PATCH 38/91] chore: refactor to make tests writing simpler and other QOL improvements. 1. 
Removes unnecessary suite description from tests 2. Removes the test suite name from the storage as well 3. Centralize the constants used everywhere in the SDK 4. Adds clarifying comments and docs wherever necessary 5. Write tests for accuracy-scorer --- .gitignore | 3 +- scripts/update-accuracy-run-status.ts | 10 +- tests/accuracy/aggregate.test.ts | 26 +-- tests/accuracy/collection-indexes.test.ts | 20 +- tests/accuracy/collection-schema.test.ts | 14 +- .../accuracy/collection-storage-size.test.ts | 35 ++- tests/accuracy/count.test.ts | 36 ++-- tests/accuracy/create-collection.test.ts | 48 ++--- tests/accuracy/create-index.test.ts | 28 ++- tests/accuracy/db-stats.test.ts | 10 +- tests/accuracy/delete-many.test.ts | 20 +- tests/accuracy/drop-collection.test.ts | 90 ++++---- tests/accuracy/drop-database.test.ts | 38 ++-- tests/accuracy/explain.test.ts | 32 ++- tests/accuracy/find.test.ts | 58 ++--- tests/accuracy/insert-many.test.ts | 28 +-- tests/accuracy/list-collections.test.ts | 20 +- tests/accuracy/list-databases.test.ts | 16 +- tests/accuracy/logs.test.ts | 36 ++-- tests/accuracy/rename-collection.test.ts | 20 +- tests/accuracy/sdk/accuracy-scorer.ts | 114 ++++++++++ tests/accuracy/sdk/accuracy-scorers.ts | 60 ------ .../disk-snapshot-storage.ts | 16 +- .../get-snapshot-storage.ts | 13 -- .../mdb-snapshot-storage.ts | 1 - .../snapshot-storage.ts | 67 +++++- tests/accuracy/sdk/accuracy-testing-client.ts | 39 ++-- tests/accuracy/sdk/agent.ts | 14 +- tests/accuracy/sdk/constants.ts | 18 ++ tests/accuracy/sdk/describe-accuracy-tests.ts | 112 +++++----- tests/accuracy/sdk/models.ts | 28 ++- tests/accuracy/update-many.test.ts | 26 +-- tests/unit/accuracy-scorer.test.ts | 199 ++++++++++++++++++ 33 files changed, 739 insertions(+), 556 deletions(-) create mode 100644 tests/accuracy/sdk/accuracy-scorer.ts delete mode 100644 tests/accuracy/sdk/accuracy-scorers.ts create mode 100644 tests/accuracy/sdk/constants.ts create mode 100644 tests/unit/accuracy-scorer.test.ts 
diff --git a/.gitignore b/.gitignore index 2ac1f762..49550e27 100644 --- a/.gitignore +++ b/.gitignore @@ -11,4 +11,5 @@ state.json tests/tmp coverage -.accuracy-snapshots +# Generated assets by accuracy runs +.accuracy diff --git a/scripts/update-accuracy-run-status.ts b/scripts/update-accuracy-run-status.ts index 6d1a8bb8..6d8e3895 100644 --- a/scripts/update-accuracy-run-status.ts +++ b/scripts/update-accuracy-run-status.ts @@ -1,13 +1,9 @@ import { getAccuracySnapshotStorage } from "../tests/accuracy/sdk/accuracy-snapshot-storage/get-snapshot-storage.js"; -import { - AccuracyRunStatus, - AccuracyRunStatuses, -} from "../tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.js"; +import { AccuracyRunStatus } from "../tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.js"; const envAccuracyRunId = process.env.MDB_ACCURACY_RUN_ID; const envAccuracyRunStatus = process.env.MDB_ACCURACY_RUN_STATUS; -let status: AccuracyRunStatuses | undefined; if ( !envAccuracyRunId || (envAccuracyRunStatus !== AccuracyRunStatus.Done && envAccuracyRunStatus !== AccuracyRunStatus.Failed) @@ -15,8 +11,8 @@ if ( process.exit(1); } -console.time(`Marked accuracy run id - ${envAccuracyRunId} as ${status} in`); +console.time(`Marked accuracy run id - ${envAccuracyRunId} as ${envAccuracyRunStatus} in`); const storage = await getAccuracySnapshotStorage(); await storage.updateAccuracyRunStatus(envAccuracyRunId, envAccuracyRunStatus); await storage.close(); -console.timeEnd(`Marked accuracy run id - ${envAccuracyRunId} as ${status} in`); +console.timeEnd(`Marked accuracy run id - ${envAccuracyRunId} as ${envAccuracyRunStatus} in`); diff --git a/tests/accuracy/aggregate.test.ts b/tests/accuracy/aggregate.test.ts index 3da1ca32..30a5a0e3 100644 --- a/tests/accuracy/aggregate.test.ts +++ b/tests/accuracy/aggregate.test.ts @@ -1,28 +1,16 @@ -import { describeAccuracyTests, describeSuite } from "./sdk/describe-accuracy-tests.js"; +import { describeAccuracyTests } from 
"./sdk/describe-accuracy-tests.js"; import { getAvailableModels } from "./sdk/models.js"; -import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; -function callsAggregate(prompt: string, pipeline: Record[]): AccuracyTestConfig { - return { - injectConnectedAssumption: true, - prompt: prompt, - mockedTools: {}, +describeAccuracyTests(getAvailableModels(), [ + { + prompt: "Group all the movies in 'mflix.movies' namespace by 'release_year' and give me a count of them", expectedToolCalls: [ { toolName: "aggregate", parameters: { - pipeline: pipeline, + pipeline: { $group: { _id: "$release_year", count: { $sum: 1 } } }, }, }, ], - }; -} - -describeAccuracyTests(getAvailableModels(), { - ...describeSuite("should call 'aggregate' tool", [ - callsAggregate( - "Group all the movies in 'mflix.movies' namespace by 'release_year' and give me a count of them", - [{ $group: { _id: "$release_year", count: { $sum: 1 } } }] - ), - ]), -}); + }, +]); diff --git a/tests/accuracy/collection-indexes.test.ts b/tests/accuracy/collection-indexes.test.ts index e53ddb43..dab7d317 100644 --- a/tests/accuracy/collection-indexes.test.ts +++ b/tests/accuracy/collection-indexes.test.ts @@ -1,12 +1,10 @@ -import { describeAccuracyTests, describeSuite } from "./sdk/describe-accuracy-tests.js"; +import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; import { getAvailableModels } from "./sdk/models.js"; import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; function callsCollectionIndexes(prompt: string): AccuracyTestConfig { return { - injectConnectedAssumption: true, prompt: prompt, - mockedTools: {}, expectedToolCalls: [ { toolName: "collection-indexes", @@ -19,12 +17,10 @@ function callsCollectionIndexes(prompt: string): AccuracyTestConfig { }; } -describeAccuracyTests(getAvailableModels(), { - ...describeSuite("should call 'collection-indexes' tool", [ - callsCollectionIndexes("How many indexes do I have in 'mflix.movies' namespace?"), - 
callsCollectionIndexes("List all the indexes in movies collection in mflix database"), - callsCollectionIndexes( - `Is the following query: ${JSON.stringify({ runtime: { $lt: 100 } })} on the namespace 'mflix.movies' indexed?` - ), - ]), -}); +describeAccuracyTests(getAvailableModels(), [ + callsCollectionIndexes("How many indexes do I have in 'mflix.movies' namespace?"), + callsCollectionIndexes("List all the indexes in movies collection in mflix database"), + callsCollectionIndexes( + `Is the following query: ${JSON.stringify({ runtime: { $lt: 100 } })} on the namespace 'mflix.movies' indexed?` + ), +]); diff --git a/tests/accuracy/collection-schema.test.ts b/tests/accuracy/collection-schema.test.ts index 2866e709..f2f22a88 100644 --- a/tests/accuracy/collection-schema.test.ts +++ b/tests/accuracy/collection-schema.test.ts @@ -1,12 +1,10 @@ -import { describeAccuracyTests, describeSuite } from "./sdk/describe-accuracy-tests.js"; +import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; import { getAvailableModels } from "./sdk/models.js"; import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; function callsCollectionSchema(prompt: string): AccuracyTestConfig { return { - injectConnectedAssumption: true, prompt: prompt, - mockedTools: {}, expectedToolCalls: [ { toolName: "collection-schema", @@ -19,9 +17,7 @@ function callsCollectionSchema(prompt: string): AccuracyTestConfig { }; } -describeAccuracyTests(getAvailableModels(), { - ...describeSuite("should call 'collection-schema' tool", [ - callsCollectionSchema("Is there a title field in 'db1.coll1' namespace?"), - callsCollectionSchema("What is the type of value stored in title field in coll1 collection in db1 database?"), - ]), -}); +describeAccuracyTests(getAvailableModels(), [ + callsCollectionSchema("Is there a title field in 'db1.coll1' namespace?"), + callsCollectionSchema("What is the type of value stored in title field in coll1 collection in db1 database?"), +]); diff 
--git a/tests/accuracy/collection-storage-size.test.ts b/tests/accuracy/collection-storage-size.test.ts index dbb458e1..2bd2f021 100644 --- a/tests/accuracy/collection-storage-size.test.ts +++ b/tests/accuracy/collection-storage-size.test.ts @@ -1,20 +1,10 @@ -import { describeAccuracyTests, describeSuite } from "./sdk/describe-accuracy-tests.js"; +import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; import { getAvailableModels } from "./sdk/models.js"; -import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; -import { ExpectedToolCall } from "./sdk/accuracy-snapshot-storage/snapshot-storage.js"; -function callsCollectionStorageSize(prompt: string, expectedToolCalls: ExpectedToolCall[]): AccuracyTestConfig { - return { - injectConnectedAssumption: true, - prompt: prompt, - mockedTools: {}, - expectedToolCalls: expectedToolCalls, - }; -} - -describeAccuracyTests(getAvailableModels(), { - ...describeSuite("should only call 'collection-storage-size' tool", [ - callsCollectionStorageSize("What is the size of 'mflix.movies' namespace", [ +describeAccuracyTests(getAvailableModels(), [ + { + prompt: "What is the size of 'mflix.movies' namespace", + expectedToolCalls: [ { toolName: "collection-storage-size", parameters: { @@ -22,10 +12,11 @@ describeAccuracyTests(getAvailableModels(), { collection: "movies", }, }, - ]), - ]), - ...describeSuite("should call 'collection-storage-size' tool after another tool/s", [ - callsCollectionStorageSize("How much size is each collection in comics database", [ + ], + }, + { + prompt: "How much size is each collection in comics database", + expectedToolCalls: [ { toolName: "list-collections", parameters: { @@ -46,6 +37,6 @@ describeAccuracyTests(getAvailableModels(), { collection: "characters", }, }, - ]), - ]), -}); + ], + }, +]); diff --git a/tests/accuracy/count.test.ts b/tests/accuracy/count.test.ts index 0543af76..09db4678 100644 --- a/tests/accuracy/count.test.ts +++ 
b/tests/accuracy/count.test.ts @@ -1,12 +1,10 @@ -import { describeAccuracyTests, describeSuite } from "./sdk/describe-accuracy-tests.js"; +import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; import { getAvailableModels } from "./sdk/models.js"; import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; function callsCountToolWithEmptyQuery(prompt: string, database = "mflix", collection = "movies"): AccuracyTestConfig { return { - injectConnectedAssumption: true, prompt: prompt, - mockedTools: {}, expectedToolCalls: [ { toolName: "count", @@ -26,9 +24,7 @@ function callsCountToolWithQuery( query: Record = {} ): AccuracyTestConfig { return { - injectConnectedAssumption: true, prompt: prompt, - mockedTools: {}, expectedToolCalls: [ { toolName: "count", @@ -42,19 +38,17 @@ function callsCountToolWithQuery( }; } -describeAccuracyTests(getAvailableModels(), { - ...describeSuite("should only call 'count' tool", [ - callsCountToolWithEmptyQuery("Count number of documents in 'mflix.movies' namespace."), - callsCountToolWithEmptyQuery( - "How many documents are there in 'characters' collection in 'comics' database?", - "comics", - "characters" - ), - callsCountToolWithQuery( - "Count all the documents in 'mflix.movies' namespace with runtime less than 100?", - "mflix", - "movies", - { runtime: { $lt: 100 } } - ), - ]), -}); +describeAccuracyTests(getAvailableModels(), [ + callsCountToolWithEmptyQuery("Count number of documents in 'mflix.movies' namespace."), + callsCountToolWithEmptyQuery( + "How many documents are there in 'characters' collection in 'comics' database?", + "comics", + "characters" + ), + callsCountToolWithQuery( + "Count all the documents in 'mflix.movies' namespace with runtime less than 100?", + "mflix", + "movies", + { runtime: { $lt: 100 } } + ), +]); diff --git a/tests/accuracy/create-collection.test.ts b/tests/accuracy/create-collection.test.ts index d8a6266f..db7f888c 100644 --- 
a/tests/accuracy/create-collection.test.ts +++ b/tests/accuracy/create-collection.test.ts @@ -1,13 +1,11 @@ -import { describeAccuracyTests, describeSuite } from "./sdk/describe-accuracy-tests.js"; +import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; import { getAvailableModels } from "./sdk/models.js"; import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; import { ExpectedToolCall } from "./sdk/accuracy-snapshot-storage/snapshot-storage.js"; function callsCreateCollection(prompt: string, database: string, collection: string): AccuracyTestConfig { return { - injectConnectedAssumption: true, prompt: prompt, - mockedTools: {}, expectedToolCalls: [ { toolName: "create-collection", @@ -29,29 +27,25 @@ function callsCreateCollectionWithListCollections(prompt: string, expectedToolCa }; } -describeAccuracyTests(getAvailableModels(), { - ...describeSuite("should only call 'create-collection' tool", [ - callsCreateCollection("Create a new namespace 'mflix.documentaries'", "mflix", "documentaries"), - callsCreateCollection("Create a new collection villains in comics database", "comics", "villains"), - ]), - ...describeSuite("should call 'create-collection' alongside other required tools", [ - callsCreateCollectionWithListCollections( - "If and only if, the namespace 'mflix.documentaries' does not exist, then create it", - [ - { - toolName: "list-collections", - parameters: { - database: "mflix", - }, +describeAccuracyTests(getAvailableModels(), [ + callsCreateCollection("Create a new namespace 'mflix.documentaries'", "mflix", "documentaries"), + callsCreateCollection("Create a new collection villains in comics database", "comics", "villains"), + callsCreateCollectionWithListCollections( + "If and only if, the namespace 'mflix.documentaries' does not exist, then create it", + [ + { + toolName: "list-collections", + parameters: { + database: "mflix", }, - { - toolName: "create-collection", - parameters: { - database: "mflix", - 
collection: "documentaries", - }, + }, + { + toolName: "create-collection", + parameters: { + database: "mflix", + collection: "documentaries", }, - ] - ), - ]), -}); + }, + ] + ), +]); diff --git a/tests/accuracy/create-index.test.ts b/tests/accuracy/create-index.test.ts index 82e98e92..6dae12e5 100644 --- a/tests/accuracy/create-index.test.ts +++ b/tests/accuracy/create-index.test.ts @@ -1,12 +1,10 @@ -import { describeAccuracyTests, describeSuite } from "./sdk/describe-accuracy-tests.js"; +import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; import { getAvailableModels } from "./sdk/models.js"; import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; function callsCreateIndex(prompt: string, indexKeys: Record): AccuracyTestConfig { return { - injectConnectedAssumption: true, prompt: prompt, - mockedTools: {}, expectedToolCalls: [ { toolName: "create-index", @@ -20,16 +18,14 @@ function callsCreateIndex(prompt: string, indexKeys: Record): A }; } -describeAccuracyTests(getAvailableModels(), { - ...describeSuite("should call 'create-index' tool", [ - callsCreateIndex( - "Create an index that covers the following query on 'mflix.movies' namespace - { \"release_year\": 1992 }", - { - release_year: 1, - } - ), - callsCreateIndex("Create a text index on title field in 'mflix.movies' namespace", { - title: "text", - }), - ]), -}); +describeAccuracyTests(getAvailableModels(), [ + callsCreateIndex( + "Create an index that covers the following query on 'mflix.movies' namespace - { \"release_year\": 1992 }", + { + release_year: 1, + } + ), + callsCreateIndex("Create a text index on title field in 'mflix.movies' namespace", { + title: "text", + }), +]); diff --git a/tests/accuracy/db-stats.test.ts b/tests/accuracy/db-stats.test.ts index b88fbb3c..656eccc2 100644 --- a/tests/accuracy/db-stats.test.ts +++ b/tests/accuracy/db-stats.test.ts @@ -1,12 +1,10 @@ -import { describeAccuracyTests, describeSuite } from 
"./sdk/describe-accuracy-tests.js"; +import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; import { getAvailableModels } from "./sdk/models.js"; import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; function callsListDatabases(prompt: string, database = "mflix"): AccuracyTestConfig { return { - injectConnectedAssumption: true, prompt: prompt, - mockedTools: {}, expectedToolCalls: [ { toolName: "db-stats", @@ -18,8 +16,4 @@ function callsListDatabases(prompt: string, database = "mflix"): AccuracyTestCon }; } -describeAccuracyTests(getAvailableModels(), { - ...describeSuite("should only call 'db-stats' tool", [ - callsListDatabases("What is the size occupied by database mflix?"), - ]), -}); +describeAccuracyTests(getAvailableModels(), [callsListDatabases("What is the size occupied by database mflix?")]); diff --git a/tests/accuracy/delete-many.test.ts b/tests/accuracy/delete-many.test.ts index f9c03740..c0dd4d51 100644 --- a/tests/accuracy/delete-many.test.ts +++ b/tests/accuracy/delete-many.test.ts @@ -1,12 +1,10 @@ -import { describeAccuracyTests, describeSuite } from "./sdk/describe-accuracy-tests.js"; +import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; import { getAvailableModels } from "./sdk/models.js"; import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; function callsDeleteManyWithEmptyFilters(prompt: string): AccuracyTestConfig { return { - injectConnectedAssumption: true, prompt: prompt, - mockedTools: {}, expectedToolCalls: [ { toolName: "delete-many", @@ -21,9 +19,7 @@ function callsDeleteManyWithEmptyFilters(prompt: string): AccuracyTestConfig { function callsDeleteManyWithFilters(prompt: string): AccuracyTestConfig { return { - injectConnectedAssumption: true, prompt: prompt, - mockedTools: {}, expectedToolCalls: [ { toolName: "delete-many", @@ -37,12 +33,8 @@ function callsDeleteManyWithFilters(prompt: string): AccuracyTestConfig { }; } 
-describeAccuracyTests(getAvailableModels(), { - ...describeSuite("should call 'delete-many' tool", [ - callsDeleteManyWithEmptyFilters("Delete all the documents from 'mflix.movies' namespace"), - callsDeleteManyWithEmptyFilters("Purge the collection 'movies' in database 'mflix'"), - callsDeleteManyWithFilters( - "Remove all the documents from namespace 'mflix.movies' where runtime is less than 100" - ), - ]), -}); +describeAccuracyTests(getAvailableModels(), [ + callsDeleteManyWithEmptyFilters("Delete all the documents from 'mflix.movies' namespace"), + callsDeleteManyWithEmptyFilters("Purge the collection 'movies' in database 'mflix'"), + callsDeleteManyWithFilters("Remove all the documents from namespace 'mflix.movies' where runtime is less than 100"), +]); diff --git a/tests/accuracy/drop-collection.test.ts b/tests/accuracy/drop-collection.test.ts index 89f9cb70..98ba3348 100644 --- a/tests/accuracy/drop-collection.test.ts +++ b/tests/accuracy/drop-collection.test.ts @@ -1,13 +1,11 @@ -import { describeAccuracyTests, describeSuite } from "./sdk/describe-accuracy-tests.js"; +import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; import { getAvailableModels } from "./sdk/models.js"; import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; import { ExpectedToolCall } from "./sdk/accuracy-snapshot-storage/snapshot-storage.js"; function onlyCallsDropCollection(prompt: string): AccuracyTestConfig { return { - injectConnectedAssumption: true, prompt: prompt, - mockedTools: {}, expectedToolCalls: [ { toolName: "drop-collection", @@ -22,61 +20,55 @@ function onlyCallsDropCollection(prompt: string): AccuracyTestConfig { function callsDropCollection(prompt: string, expectedToolCalls: ExpectedToolCall[]): AccuracyTestConfig { return { - injectConnectedAssumption: true, prompt: prompt, - mockedTools: {}, expectedToolCalls, }; } -describeAccuracyTests(getAvailableModels(), { - ...describeSuite("should only call 'drop-collection' tool", [ - 
onlyCallsDropCollection("Remove mflix.movies namespace from my cluster."), - onlyCallsDropCollection("Drop movies collection from mflix database."), - ]), - ...describeSuite("should call 'drop-collection' after calling other necessary tools", [ - callsDropCollection("Remove books collection from which ever database contains it.", [ - { - toolName: "list-databases", - parameters: {}, +describeAccuracyTests(getAvailableModels(), [ + onlyCallsDropCollection("Remove mflix.movies namespace from my cluster."), + onlyCallsDropCollection("Drop movies collection from mflix database."), + callsDropCollection("Remove books collection from which ever database contains it.", [ + { + toolName: "list-databases", + parameters: {}, + }, + { + toolName: "list-collections", + parameters: { + database: "admin", }, - { - toolName: "list-collections", - parameters: { - database: "admin", - }, + }, + { + toolName: "list-collections", + parameters: { + database: "comics", }, - { - toolName: "list-collections", - parameters: { - database: "comics", - }, + }, + { + toolName: "list-collections", + parameters: { + database: "config", }, - { - toolName: "list-collections", - parameters: { - database: "config", - }, + }, + { + toolName: "list-collections", + parameters: { + database: "local", }, - { - toolName: "list-collections", - parameters: { - database: "local", - }, + }, + { + toolName: "list-collections", + parameters: { + database: "mflix", }, - { - toolName: "list-collections", - parameters: { - database: "mflix", - }, - }, - { - toolName: "drop-collection", - parameters: { - database: "comics", - collection: "books", - }, + }, + { + toolName: "drop-collection", + parameters: { + database: "comics", + collection: "books", }, - ]), + }, ]), -}); +]); diff --git a/tests/accuracy/drop-database.test.ts b/tests/accuracy/drop-database.test.ts index 0518d982..53fc7fd5 100644 --- a/tests/accuracy/drop-database.test.ts +++ b/tests/accuracy/drop-database.test.ts @@ -1,13 +1,11 @@ -import { 
describeAccuracyTests, describeSuite } from "./sdk/describe-accuracy-tests.js"; +import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; import { getAvailableModels } from "./sdk/models.js"; import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; import { ExpectedToolCall } from "./sdk/accuracy-snapshot-storage/snapshot-storage.js"; function onlyCallsDropDatabase(prompt: string): AccuracyTestConfig { return { - injectConnectedAssumption: true, prompt: prompt, - mockedTools: {}, expectedToolCalls: [ { toolName: "drop-database", @@ -21,30 +19,24 @@ function onlyCallsDropDatabase(prompt: string): AccuracyTestConfig { function callsDropDatabase(prompt: string, expectedToolCalls: ExpectedToolCall[]): AccuracyTestConfig { return { - injectConnectedAssumption: true, prompt: prompt, - mockedTools: {}, expectedToolCalls, }; } -describeAccuracyTests(getAvailableModels(), { - ...describeSuite("should only call 'drop-database' tool", [ - onlyCallsDropDatabase("Remove mflix database from my cluster."), - onlyCallsDropDatabase("Drop database named mflix."), - ]), - ...describeSuite("should call 'drop-database' after calling other necessary tools", [ - callsDropDatabase("If there is a mflix database in my cluster then drop it.", [ - { - toolName: "list-databases", - parameters: {}, - }, - { - toolName: "drop-database", - parameters: { - database: "mflix", - }, +describeAccuracyTests(getAvailableModels(), [ + onlyCallsDropDatabase("Remove mflix database from my cluster."), + onlyCallsDropDatabase("Drop database named mflix."), + callsDropDatabase("If there is a mflix database in my cluster then drop it.", [ + { + toolName: "list-databases", + parameters: {}, + }, + { + toolName: "drop-database", + parameters: { + database: "mflix", }, - ]), + }, ]), -}); +]); diff --git a/tests/accuracy/explain.test.ts b/tests/accuracy/explain.test.ts index 6e767981..4a539c48 100644 --- a/tests/accuracy/explain.test.ts +++ b/tests/accuracy/explain.test.ts @@ -1,12 
+1,10 @@ -import { describeAccuracyTests, describeSuite } from "./sdk/describe-accuracy-tests.js"; +import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; import { getAvailableModels } from "./sdk/models.js"; import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; function callsExplain(prompt: string, method: Record): AccuracyTestConfig { return { - injectConnectedAssumption: true, prompt: prompt, - mockedTools: {}, expectedToolCalls: [ { toolName: "explain", @@ -53,20 +51,14 @@ const callsExplainWithCount = (prompt: string) => * because we are using Zod.union, when we probably should've used * Zod.discriminatedUnion */ -describeAccuracyTests(getAvailableModels(), { - ...describeSuite("should call 'explain' tool for a find query", [ - callsExplainWithFind( - `Will fetching documents, where release_year is 2020, from 'mflix.movies' namespace perform a collection scan?` - ), - ]), - ...describeSuite("should call 'explain' tool for an aggregation", [ - callsExplainWithAggregate( - `Will aggregating documents, where release_year is 2020, from 'mflix.movies' namespace perform a collection scan?` - ), - ]), - ...describeSuite("should call 'explain' tool for count", [ - callsExplainWithCount( - `Will counting documents, where release_year is 2020, from 'mflix.movies' namespace perform a collection scan?` - ), - ]), -}); +describeAccuracyTests(getAvailableModels(), [ + callsExplainWithFind( + `Will fetching documents, where release_year is 2020, from 'mflix.movies' namespace perform a collection scan?` + ), + callsExplainWithAggregate( + `Will aggregating documents, where release_year is 2020, from 'mflix.movies' namespace perform a collection scan?` + ), + callsExplainWithCount( + `Will counting documents, where release_year is 2020, from 'mflix.movies' namespace perform a collection scan?` + ), +]); diff --git a/tests/accuracy/find.test.ts b/tests/accuracy/find.test.ts index ecfbe4f3..02c02cd1 100644 --- a/tests/accuracy/find.test.ts +++ 
b/tests/accuracy/find.test.ts @@ -1,12 +1,10 @@ -import { describeAccuracyTests, describeSuite } from "./sdk/describe-accuracy-tests.js"; +import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; import { getAvailableModels } from "./sdk/models.js"; import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; function callsFindNoFilter(prompt: string, database = "mflix", collection = "movies"): AccuracyTestConfig { return { - injectConnectedAssumption: true, prompt: prompt, - mockedTools: {}, expectedToolCalls: [ { toolName: "find", @@ -21,9 +19,7 @@ function callsFindNoFilter(prompt: string, database = "mflix", collection = "mov function callsFindWithFilter(prompt: string, filter: Record): AccuracyTestConfig { return { - injectConnectedAssumption: true, prompt: prompt, - mockedTools: {}, expectedToolCalls: [ { toolName: "find", @@ -39,9 +35,7 @@ function callsFindWithFilter(prompt: string, filter: Record): A function callsFindWithProjection(prompt: string, projection: Record): AccuracyTestConfig { return { - injectConnectedAssumption: true, prompt: prompt, - mockedTools: {}, expectedToolCalls: [ { toolName: "find", @@ -61,9 +55,7 @@ function callsFindWithProjectionAndFilters( projection: Record ): AccuracyTestConfig { return { - injectConnectedAssumption: true, prompt: prompt, - mockedTools: {}, expectedToolCalls: [ { toolName: "find", @@ -85,9 +77,7 @@ function callsFindWithFilterSortAndLimit( limit: number ): AccuracyTestConfig { return { - injectConnectedAssumption: true, prompt: prompt, - mockedTools: {}, expectedToolCalls: [ { toolName: "find", @@ -103,27 +93,25 @@ function callsFindWithFilterSortAndLimit( }; } -describeAccuracyTests(getAvailableModels(), { - ...describeSuite("should only call find tool", [ - callsFindNoFilter("List all the movies in 'mflix.movies' namespace."), - callsFindNoFilter("List all the documents in 'comics.books' namespace.", "comics", "books"), - callsFindWithFilter("Find all the movies in 'mflix.movies' 
namespace with runtime less than 100.", { - runtime: { $lt: 100 }, - }), - callsFindWithFilter("Find all movies in 'mflix.movies' collection where director is 'Christina Collins'", { - director: "Christina Collins", - }), - callsFindWithProjection("Give me all the movie titles available in 'mflix.movies' namespace", { title: 1 }), - callsFindWithProjectionAndFilters( - "Use 'mflix.movies' namespace to answer who were casted in the movie 'Certain Fish'", - { title: "Certain Fish" }, - { cast: 1 } - ), - callsFindWithFilterSortAndLimit( - "From the mflix.movies namespace, give me first 2 movies of Horror genre sorted ascending by their runtime", - { genres: "Horror" }, - { runtime: 1 }, - 2 - ), - ]), -}); +describeAccuracyTests(getAvailableModels(), [ + callsFindNoFilter("List all the movies in 'mflix.movies' namespace."), + callsFindNoFilter("List all the documents in 'comics.books' namespace.", "comics", "books"), + callsFindWithFilter("Find all the movies in 'mflix.movies' namespace with runtime less than 100.", { + runtime: { $lt: 100 }, + }), + callsFindWithFilter("Find all movies in 'mflix.movies' collection where director is 'Christina Collins'", { + director: "Christina Collins", + }), + callsFindWithProjection("Give me all the movie titles available in 'mflix.movies' namespace", { title: 1 }), + callsFindWithProjectionAndFilters( + "Use 'mflix.movies' namespace to answer who were casted in the movie 'Certain Fish'", + { title: "Certain Fish" }, + { cast: 1 } + ), + callsFindWithFilterSortAndLimit( + "From the mflix.movies namespace, give me first 2 movies of Horror genre sorted ascending by their runtime", + { genres: "Horror" }, + { runtime: 1 }, + 2 + ), +]); diff --git a/tests/accuracy/insert-many.test.ts b/tests/accuracy/insert-many.test.ts index 25d60017..4ce15bb8 100644 --- a/tests/accuracy/insert-many.test.ts +++ b/tests/accuracy/insert-many.test.ts @@ -1,12 +1,10 @@ -import { describeAccuracyTests, describeSuite } from 
"./sdk/describe-accuracy-tests.js"; +import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; import { getAvailableModels } from "./sdk/models.js"; import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; function callsInsertMany(prompt: string): AccuracyTestConfig { return { - injectConnectedAssumption: true, prompt: prompt, - mockedTools: {}, expectedToolCalls: [ { toolName: "insert-many", @@ -35,9 +33,7 @@ function callsInsertMany(prompt: string): AccuracyTestConfig { function callsEmptyInsertMany(prompt: string) { return { - injectConnectedAssumption: true, prompt: prompt, - mockedTools: {}, expectedToolCalls: [ { toolName: "insert-many", @@ -51,15 +47,13 @@ function callsEmptyInsertMany(prompt: string) { }; } -describeAccuracyTests(getAvailableModels(), { - ...describeSuite("should call 'insert-many' tool", [ - callsInsertMany( - [ - "In my namespace 'mflix.movies', insert 3 documents each with the following fields:", - "- id: an incremental number starting from 1", - "- name: a string of format 'name'", - ].join("\n") - ), - callsEmptyInsertMany("Add three empty documents in collection 'movies' inside database 'mflix'"), - ]), -}); +describeAccuracyTests(getAvailableModels(), [ + callsInsertMany( + [ + "In my namespace 'mflix.movies', insert 3 documents each with the following fields:", + "- id: an incremental number starting from 1", + "- name: a string of format 'name'", + ].join("\n") + ), + callsEmptyInsertMany("Add three empty documents in collection 'movies' inside database 'mflix'"), +]); diff --git a/tests/accuracy/list-collections.test.ts b/tests/accuracy/list-collections.test.ts index a8455418..78a14f34 100644 --- a/tests/accuracy/list-collections.test.ts +++ b/tests/accuracy/list-collections.test.ts @@ -1,12 +1,10 @@ -import { describeAccuracyTests, describeSuite } from "./sdk/describe-accuracy-tests.js"; +import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; import { getAvailableModels } from 
"./sdk/models.js"; import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; function callsListCollections(prompt: string): AccuracyTestConfig { return { - injectConnectedAssumption: true, prompt: prompt, - mockedTools: {}, expectedToolCalls: [ { toolName: "list-collections", @@ -50,13 +48,9 @@ function callsListDatabasesAndListCollections(prompt: string): AccuracyTestConfi }; } -describeAccuracyTests(getAvailableModels(), { - ...describeSuite("should only call list-collections tool", [ - callsListCollections("How many collections do I have in database mflix?"), - callsListCollections("List all the collections in my MongoDB database mflix."), - callsListCollections("Is there a shows collection in my MongoDB database mflix?"), - ]), - ...describeSuite("should call list-databases and list-collections tool", [ - callsListDatabasesAndListCollections("List all the collections that I have in total on my cluster?"), - ]), -}); +describeAccuracyTests(getAvailableModels(), [ + callsListCollections("How many collections do I have in database mflix?"), + callsListCollections("List all the collections in my MongoDB database mflix."), + callsListCollections("Is there a shows collection in my MongoDB database mflix?"), + callsListDatabasesAndListCollections("List all the collections that I have in total on my cluster?"), +]); diff --git a/tests/accuracy/list-databases.test.ts b/tests/accuracy/list-databases.test.ts index 0ef88712..97a8ce27 100644 --- a/tests/accuracy/list-databases.test.ts +++ b/tests/accuracy/list-databases.test.ts @@ -1,12 +1,10 @@ -import { describeAccuracyTests, describeSuite } from "./sdk/describe-accuracy-tests.js"; +import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; import { getAvailableModels } from "./sdk/models.js"; import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; function callsListDatabases(prompt: string): AccuracyTestConfig { return { - injectConnectedAssumption: true, prompt: prompt, - 
mockedTools: {}, expectedToolCalls: [ { toolName: "list-databases", @@ -16,10 +14,8 @@ function callsListDatabases(prompt: string): AccuracyTestConfig { }; } -describeAccuracyTests(getAvailableModels(), { - ...describeSuite("should only call list-databases tool", [ - callsListDatabases("How many databases do I have?"), - callsListDatabases("List all the databases that I have in my clusters"), - callsListDatabases("Is there a mflix database in my cluster?"), - ]), -}); +describeAccuracyTests(getAvailableModels(), [ + callsListDatabases("How many databases do I have?"), + callsListDatabases("List all the databases that I have in my clusters"), + callsListDatabases("Is there a mflix database in my cluster?"), +]); diff --git a/tests/accuracy/logs.test.ts b/tests/accuracy/logs.test.ts index 4ca148b9..8b9d2193 100644 --- a/tests/accuracy/logs.test.ts +++ b/tests/accuracy/logs.test.ts @@ -1,31 +1,27 @@ -import { describeAccuracyTests, describeSuite } from "./sdk/describe-accuracy-tests.js"; +import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; import { getAvailableModels } from "./sdk/models.js"; import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; import { ExpectedToolCall } from "./sdk/accuracy-snapshot-storage/snapshot-storage.js"; function callsLogsTool(prompt: string, toolCall: ExpectedToolCall): AccuracyTestConfig { return { - injectConnectedAssumption: true, prompt: prompt, - mockedTools: {}, expectedToolCalls: [toolCall], }; } -describeAccuracyTests(getAvailableModels(), { - ...describeSuite("should call 'logs' tool", [ - callsLogsTool("Were there any startup warnings for my MongoDB server?", { - toolName: "mongodb-logs", - parameters: { - type: "startupWarnings", - }, - }), - callsLogsTool("Retrieve first 10 logs for my MongoDB server?", { - toolName: "mongodb-logs", - parameters: { - type: "global", - limit: 10, - }, - }), - ]), -}); +describeAccuracyTests(getAvailableModels(), [ + callsLogsTool("Were there any startup 
warnings for my MongoDB server?", { + toolName: "mongodb-logs", + parameters: { + type: "startupWarnings", + }, + }), + callsLogsTool("Retrieve first 10 logs for my MongoDB server?", { + toolName: "mongodb-logs", + parameters: { + type: "global", + limit: 10, + }, + }), +]); diff --git a/tests/accuracy/rename-collection.test.ts b/tests/accuracy/rename-collection.test.ts index d8d46025..549a02b9 100644 --- a/tests/accuracy/rename-collection.test.ts +++ b/tests/accuracy/rename-collection.test.ts @@ -1,12 +1,10 @@ -import { describeAccuracyTests, describeSuite } from "./sdk/describe-accuracy-tests.js"; +import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; import { getAvailableModels } from "./sdk/models.js"; import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; function callsRenameCollection(prompt: string): AccuracyTestConfig { return { - injectConnectedAssumption: true, prompt: prompt, - mockedTools: {}, expectedToolCalls: [ { toolName: "rename-collection", @@ -22,9 +20,7 @@ function callsRenameCollection(prompt: string): AccuracyTestConfig { function callsRenameCollectionWithDropTarget(prompt: string): AccuracyTestConfig { return { - injectConnectedAssumption: true, prompt: prompt, - mockedTools: {}, expectedToolCalls: [ { toolName: "rename-collection", @@ -39,11 +35,9 @@ function callsRenameCollectionWithDropTarget(prompt: string): AccuracyTestConfig }; } -describeAccuracyTests(getAvailableModels(), { - ...describeSuite("should only call 'rename-collection' tool", [ - callsRenameCollection("Rename my 'mflix.movies' namespace to 'mflix.new_movies'"), - callsRenameCollectionWithDropTarget( - "Rename my 'mflix.movies' namespace to 'mflix.new_movies' while removing the old namespace." 
- ), - ]), -}); +describeAccuracyTests(getAvailableModels(), [ + callsRenameCollection("Rename my 'mflix.movies' namespace to 'mflix.new_movies'"), + callsRenameCollectionWithDropTarget( + "Rename my 'mflix.movies' namespace to 'mflix.new_movies' while removing the old namespace." + ), +]); diff --git a/tests/accuracy/sdk/accuracy-scorer.ts b/tests/accuracy/sdk/accuracy-scorer.ts new file mode 100644 index 00000000..2ae13e6c --- /dev/null +++ b/tests/accuracy/sdk/accuracy-scorer.ts @@ -0,0 +1,114 @@ +import diff from "microdiff"; +import { ExpectedToolCall, LLMToolCall } from "./accuracy-snapshot-storage/snapshot-storage.js"; + +/** + * Tool calling accuracy is a single number calculated based on two dimensions. + * 1. Did LLM call the right tool? + * 2. Did LLM call the tool with correct and required parameters? + * + * The number can be one of: + * - 0: When LLM: + * - did not call the right tool + * - did not call the tool with correct parameters + * - 0.75: When LLM: + * - called the right tool but hallucinated and called some extra tools as + * well or called the same tool but with different parameters + * - called the right tool but hallucinated and called it with some + * non-required parameters + * - 1: When LLM: + * - called exactly the tools that were expected + * - called the expected tools exactly with the expected parameters + * + * To calculate this number we must have: + * 1. a list of expected tool calls with their expected parameters + * 2. a list of LLM tool calls with their parameters + * + * For each expected tool call we find the best matching LLM tool call. Best + * matching LLM tool call will have: + * 1. the same name as that of the expected tool call + * 2. highest parameter similarity score, with at-least 0.75 to ensure an actual + * match. And in case of competing scores, we take the first one that appears + * in the LLM tool calls. + * + * Using the above logic we establish pairs between expected and actual tool + * calls. + * + * 1. 
If we could not pair some LLM tool calls with expected tool calls that + * means the LLM hallucinated over the extra tool calls. For that reason we + * will cap the maximum achievable accuracy to 0.75. + * + * 2. If we could not pair some expected tool calls with LLM tool calls that + * means the LLM did not call one of the expected tool required to solve the + * problem. For that reason we will mark the accuracy as 0 and exit early. + * + * 3. Now for each of the established tool call pairs, we will determine how + * correctly the parameters were called using the parameter similarity score. + * The parameter similarity score follow the same accuracy number pattern + * described above: + * - 0 : for missing parameters, incorrect parameter values + * - 0.75 : for additional parameters + * - 1 : for a perfect match + * + * The final accuracy score is then calculated as the least of: + * - Maximum achievable accuracy from #1 + * - The least of parameter similarity score from the established pairs in #3 + * + * For examples: see the test cases in - tests/unit/accuracy-scorer.test.ts + */ +export function calculateToolCallingAccuracy( + expectedToolCalls: ExpectedToolCall[], + actualToolCalls: LLMToolCall[] +): number { + if (expectedToolCalls.length === 0) { + return actualToolCalls.length === 0 ? 1 : 0.75; + } + + const maxAccuracy = actualToolCalls.length > expectedToolCalls.length ? 
0.75 : 1; + + const individualAccuracies: number[] = []; + const checkedActualToolCallIndexes = new Set(); + + for (const expectedCall of expectedToolCalls) { + const candidates = actualToolCalls + .map((call, index) => ({ call, index })) + .filter( + ({ call, index }) => !checkedActualToolCallIndexes.has(index) && call.toolName === expectedCall.toolName + ) + .map(({ call, index }) => ({ + call, + index, + score: compareParams(expectedCall.parameters, call.parameters), + })) + .filter(({ score }) => score >= 0.75) + .sort((a, b) => b.score - a.score || a.index - b.index); + + const bestMatch = candidates[0]; + if (!bestMatch) { + individualAccuracies.push(0); + } else { + checkedActualToolCallIndexes.add(bestMatch.index); + const individualAccuracy = Math.min(bestMatch.score, maxAccuracy); + individualAccuracies.push(individualAccuracy); + } + } + + return Math.min(...individualAccuracies); +} + +function compareParams(expected: Record, actual: Record): number { + const differences = diff(expected, actual); + + if (differences.length === 0) { + return 1; + } + + const hasOnlyAdditions = differences.every((d) => d.type === "CREATE"); + const hasRemovals = differences.some((d) => d.type === "REMOVE"); + const hasChanges = differences.some((d) => d.type === "CHANGE"); + + if (hasOnlyAdditions && !hasRemovals && !hasChanges) { + return 0.75; + } + + return 0; +} diff --git a/tests/accuracy/sdk/accuracy-scorers.ts b/tests/accuracy/sdk/accuracy-scorers.ts deleted file mode 100644 index 612c3f80..00000000 --- a/tests/accuracy/sdk/accuracy-scorers.ts +++ /dev/null @@ -1,60 +0,0 @@ -import diff from "microdiff"; -import { ExpectedToolCall, ActualToolCall } from "./accuracy-snapshot-storage/snapshot-storage.js"; - -export function calculateToolCallingAccuracy( - expectedToolCalls: ExpectedToolCall[], - actualToolCalls: ActualToolCall[] -): number { - if (expectedToolCalls.length === 0) { - return actualToolCalls.length === 0 ? 
1 : 0.75; - } - - const maxAccuracy = actualToolCalls.length > expectedToolCalls.length ? 0.75 : 1; - - const individualAccuracies: number[] = []; - const checkedActualToolCallIndexes = new Set(); - - for (const expectedCall of expectedToolCalls) { - const candidates = actualToolCalls - .map((call, index) => ({ call, index })) - .filter( - ({ call, index }) => !checkedActualToolCallIndexes.has(index) && call.toolName === expectedCall.toolName - ) - .map(({ call, index }) => ({ - call, - index, - score: compareParams(expectedCall.parameters, call.parameters), - })) - .filter(({ score }) => score >= 0.75) - .sort((a, b) => b.score - a.score); - - const bestMatch = candidates[0]; - if (!bestMatch) { - individualAccuracies.push(0); - } else { - checkedActualToolCallIndexes.add(bestMatch.index); - const individualAccuracy = Math.min(bestMatch.score, maxAccuracy); - individualAccuracies.push(individualAccuracy); - } - } - - return Math.min(...individualAccuracies); -} - -function compareParams(expected: Record, actual: Record): number { - const differences = diff(expected, actual); - - if (differences.length === 0) { - return 1; - } - - const hasOnlyAdditions = differences.every((d) => d.type === "CREATE"); - const hasRemovals = differences.some((d) => d.type === "REMOVE"); - const hasChanges = differences.some((d) => d.type === "CHANGE"); - - if (hasOnlyAdditions && !hasRemovals && !hasChanges) { - return 0.75; - } - - return 0; -} diff --git a/tests/accuracy/sdk/accuracy-snapshot-storage/disk-snapshot-storage.ts b/tests/accuracy/sdk/accuracy-snapshot-storage/disk-snapshot-storage.ts index a4d2bea0..a919e8f0 100644 --- a/tests/accuracy/sdk/accuracy-snapshot-storage/disk-snapshot-storage.ts +++ b/tests/accuracy/sdk/accuracy-snapshot-storage/disk-snapshot-storage.ts @@ -1,6 +1,4 @@ -import path from "path"; import fs from "fs/promises"; -import { fileURLToPath } from "url"; import { AccuracyRunStatus, AccuracyRunStatuses, @@ -8,10 +6,7 @@ import { 
AccuracySnapshotEntrySchema, AccuracySnapshotStorage, } from "./snapshot-storage.js"; -const __dirname = fileURLToPath(import.meta.url); -const rootDir = path.resolve(__dirname, "..", "..", "..", "..", ".."); -const snapshotsDir = path.resolve(rootDir, ".accuracy-snapshots"); -export const snapshotFilePath = path.resolve(snapshotsDir, "snapshots.json"); +import { GENERATED_ASSETS_DIR, LOCAL_SNAPSHOTS_FILE } from "../constants.js"; export class DiskSnapshotStorage implements AccuracySnapshotStorage { async createSnapshotEntry( @@ -21,7 +16,6 @@ export class DiskSnapshotStorage implements AccuracySnapshotStorage { | "commitSHA" | "provider" | "requestedModel" - | "test" | "prompt" | "toolCallingAccuracy" | "expectedToolCalls" @@ -95,14 +89,14 @@ export class DiskSnapshotStorage implements AccuracySnapshotStorage { } private async writeSnapshot(snapshot: AccuracySnapshotEntry[]): Promise { - const tmp = `${snapshotFilePath}~${Date.now()}`; + const tmp = `${LOCAL_SNAPSHOTS_FILE}~${Date.now()}`; await fs.writeFile(tmp, JSON.stringify(snapshot, null, 2)); - await fs.rename(tmp, snapshotFilePath); + await fs.rename(tmp, LOCAL_SNAPSHOTS_FILE); } private async readSnapshot(): Promise { try { - const raw = await fs.readFile(snapshotFilePath, "utf8"); + const raw = await fs.readFile(LOCAL_SNAPSHOTS_FILE, "utf8"); return AccuracySnapshotEntrySchema.array().parse(JSON.parse(raw)); } catch (e: unknown) { if ((e as { code: string }).code === "ENOENT") { @@ -117,7 +111,7 @@ export class DiskSnapshotStorage implements AccuracySnapshotStorage { } static async getStorage() { - await fs.mkdir(snapshotsDir, { recursive: true }); + await fs.mkdir(GENERATED_ASSETS_DIR, { recursive: true }); return new DiskSnapshotStorage(); } } diff --git a/tests/accuracy/sdk/accuracy-snapshot-storage/get-snapshot-storage.ts b/tests/accuracy/sdk/accuracy-snapshot-storage/get-snapshot-storage.ts index 3bec4c53..da67aa60 100644 --- a/tests/accuracy/sdk/accuracy-snapshot-storage/get-snapshot-storage.ts +++ 
b/tests/accuracy/sdk/accuracy-snapshot-storage/get-snapshot-storage.ts @@ -1,20 +1,7 @@ -import { getCommitSHA } from "../git-info.js"; import { DiskSnapshotStorage } from "./disk-snapshot-storage.js"; import { MongoDBSnapshotStorage } from "./mdb-snapshot-storage.js"; import { AccuracySnapshotStorage } from "./snapshot-storage.js"; export async function getAccuracySnapshotStorage(): Promise { - const accuracyRunId = process.env.MDB_ACCURACY_RUN_ID; - if (!accuracyRunId) { - throw new Error( - "Cannot create AccuracySnapshotStorage without an accuracyRunId - ensure that the relevant env variable is present." - ); - } - - const commitSHA = await getCommitSHA(); - if (!commitSHA) { - throw new Error("Cannot create AccuracySnapshotStorage without a commitSHA."); - } - return MongoDBSnapshotStorage.getStorage() ?? (await DiskSnapshotStorage.getStorage()); } diff --git a/tests/accuracy/sdk/accuracy-snapshot-storage/mdb-snapshot-storage.ts b/tests/accuracy/sdk/accuracy-snapshot-storage/mdb-snapshot-storage.ts index 2138b4f0..d3b1b56a 100644 --- a/tests/accuracy/sdk/accuracy-snapshot-storage/mdb-snapshot-storage.ts +++ b/tests/accuracy/sdk/accuracy-snapshot-storage/mdb-snapshot-storage.ts @@ -30,7 +30,6 @@ export class MongoDBSnapshotStorage implements AccuracySnapshotStorage { | "commitSHA" | "provider" | "requestedModel" - | "test" | "prompt" | "toolCallingAccuracy" | "expectedToolCalls" diff --git a/tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.ts b/tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.ts index e7833456..e0a6966d 100644 --- a/tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.ts +++ b/tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.ts @@ -1,13 +1,14 @@ import z from "zod"; -const ExpectedToolCallSchema = z.object({ +const LLMToolCallSchema = z.object({ + toolCallId: z.string(), toolName: z.string(), parameters: z.record(z.string(), z.unknown()), }); -export type ExpectedToolCall = z.infer; +export type 
LLMToolCall = z.infer; -const ActualToolCallSchema = ExpectedToolCallSchema.extend({ toolCallId: z.string() }); -export type ActualToolCall = z.infer; +const ExpectedToolCallSchema = LLMToolCallSchema.omit({ toolCallId: true }); +export type ExpectedToolCall = z.infer; export const AccuracyRunStatus = { Done: "done", @@ -18,23 +19,58 @@ export const AccuracyRunStatus = { export type AccuracyRunStatuses = (typeof AccuracyRunStatus)[keyof typeof AccuracyRunStatus]; export const AccuracySnapshotEntrySchema = z.object({ - // Git and meta information for snapshot entries + /** + * A unique id for each accuracy run. Should either be generated by the + * script triggering the accuracy run or provided via environment variables. + * */ accuracyRunId: z.string(), + + /** + * Represents the status of accuracy run. Each test completion, during an + * accuracy run, is supposed to submit an accuracy snapshot entry with + * InProgress status which then later, after completion of accuracy run, is + * updated to either Done or Failed, depending on whether there were errors + * during the run or not. */ accuracyRunStatus: z .enum([AccuracyRunStatus.Done, AccuracyRunStatus.Failed, AccuracyRunStatus.InProgress]) .default(AccuracyRunStatus.InProgress), + + /** Timestamp of when this snapshot entry was generated. */ createdOn: z.number(), + + /** The commit SHA for which the accuracy run was triggered. */ commitSHA: z.string(), - // Accuracy info + + /** The LLM provider providing the LLM APIs */ provider: z.string(), + + /** The LLM which was requested to respond to our test prompts */ requestedModel: z.string(), - test: z.string(), + + /** The actual prompt that was provided to LLM as test */ prompt: z.string(), + + /** A number between 0 and 1, representing how accurately the expected tools + * were called by LLM when responding to the provided prompts. 
To know more + * about how this number is generated, check - toolCallingAccuracy.ts */ toolCallingAccuracy: z.number(), - // debug info for further investigations + + /** + * A list of tools, along with their parameters, that are expected to be + * called by the LLM in test. */ expectedToolCalls: ExpectedToolCallSchema.array(), - actualToolCalls: ActualToolCallSchema.array(), + + /** + * A list of tools, along with their parameters, that were actually called + * by the LLM in test. */ + actualToolCalls: LLMToolCallSchema.array(), + + /** + * The total time taken by LLM to respond to our prompt. */ llmResponseTime: z.number(), + + /** + * Token usage data, returned as part of LLM prompt response. */ tokensUsage: z .object({ promptTokens: z.number().optional(), @@ -42,8 +78,20 @@ export const AccuracySnapshotEntrySchema = z.object({ totalTokens: z.number().optional(), }) .optional(), + + /** + * The ID of the model that actually responded to our prompt request. */ respondingModel: z.string(), + + /** + * The final response text generated by the LLM, in response to our prompt + * request. */ text: z.string(), + + /** + * A list of messages, exchanged between LLM and our testing agent, in + * response to our prompt request. This is particularly helpful for + * debugging. 
*/ messages: z.array(z.record(z.string(), z.unknown())), }); @@ -57,7 +105,6 @@ export interface AccuracySnapshotStorage { | "commitSHA" | "provider" | "requestedModel" - | "test" | "prompt" | "toolCallingAccuracy" | "expectedToolCalls" diff --git a/tests/accuracy/sdk/accuracy-testing-client.ts b/tests/accuracy/sdk/accuracy-testing-client.ts index 4a8ad279..d2486942 100644 --- a/tests/accuracy/sdk/accuracy-testing-client.ts +++ b/tests/accuracy/sdk/accuracy-testing-client.ts @@ -1,22 +1,27 @@ -import path from "path"; import { v4 as uuid } from "uuid"; -import { fileURLToPath } from "url"; import { experimental_createMCPClient as createMCPClient, tool as createVercelTool } from "ai"; import { CallToolResult } from "@modelcontextprotocol/sdk/types.js"; import { StdioClientTransport } from "@modelcontextprotocol/sdk/client/stdio.js"; -import { ActualToolCall } from "./accuracy-snapshot-storage/snapshot-storage.js"; - -const __dirname = fileURLToPath(import.meta.url); -const distPath = path.join(__dirname, "..", "..", "..", "..", "dist"); -const cliScriptPath = path.join(distPath, "index.js"); +import { MCP_SERVER_CLI_SCRIPT } from "./constants.js"; +import { LLMToolCall } from "./accuracy-snapshot-storage/snapshot-storage.js"; type ToolResultGeneratorFn = (...parameters: unknown[]) => CallToolResult | Promise; export type MockedTools = Record; +/** + * AccuracyTestingClient is a bridge between actual MCP client connected to our + * MCP server and our Tool calling agent. Its serves the following purposes: + * 1. Captures actual tools provided by our MCP server + * 2. Translates captured MCP tools to tool definitions that can be consumed by + * Tool Calling agent (Ref: `vercelTools`) + * 3. Allow dynamic mocking and resetting of mocks of individual tool calls. + * 4. Records and provides tool calls made by LLMs with their parameters. 
+ */ export class AccuracyTestingClient { private mockedTools: MockedTools = {}; - private recordedToolCalls: ActualToolCall[] = []; + private llmToolCalls: LLMToolCall[] = []; + private constructor(private readonly vercelMCPClient: Awaited>) {} async close() { @@ -30,7 +35,7 @@ export class AccuracyTestingClient { rewrappedVercelTools[toolName] = createVercelTool({ ...tool, execute: async (args, options) => { - this.recordedToolCalls.push({ + this.llmToolCalls.push({ toolCallId: uuid(), toolName: toolName, parameters: args as Record, @@ -44,10 +49,10 @@ export class AccuracyTestingClient { return await tool.execute(args, options); } catch (error) { // There are cases when LLM calls the tools incorrectly - // and the schema definition check fails. Normally a - // tool calling agent will handle the error case but - // because we are wrapping the tool definition ourselves - // we have to handle this ourselves as well. + // and the schema definition check fails. In production, + // the tool calling agents are deployed with this fail + // safe to allow LLM to course correct themselves. That + // is exactly what we do here as well. 
return { isError: true, content: JSON.stringify(error), @@ -60,8 +65,8 @@ export class AccuracyTestingClient { return rewrappedVercelTools; } - getToolCalls() { - return this.recordedToolCalls; + getLLMToolCalls() { + return this.llmToolCalls; } mockTools(mockedTools: MockedTools) { @@ -70,13 +75,13 @@ export class AccuracyTestingClient { resetForTests() { this.mockTools({}); - this.recordedToolCalls = []; + this.llmToolCalls = []; } static async initializeClient(mdbConnectionString: string) { const clientTransport = new StdioClientTransport({ command: process.execPath, - args: [cliScriptPath, "--connectionString", mdbConnectionString], + args: [MCP_SERVER_CLI_SCRIPT, "--connectionString", mdbConnectionString], }); const client = await createMCPClient({ diff --git a/tests/accuracy/sdk/agent.ts b/tests/accuracy/sdk/agent.ts index 4b5d2621..ee0b5f7f 100644 --- a/tests/accuracy/sdk/agent.ts +++ b/tests/accuracy/sdk/agent.ts @@ -10,14 +10,13 @@ const systemPrompt = [ 'If you do not know the answer or the request cannot be fulfilled, you MUST reply with "I don\'t know"', ]; -// Some necessary types from Vercel SDK +// These types are not exported by Vercel SDK so we derive them here to be +// re-used again. 
export type VercelMCPClient = Awaited>; export type VercelMCPClientTools = Awaited>; export type VercelAgent = ReturnType; -// Generic interface for Agent, in case we need to switch to some other agent -// development SDK -export interface AgentPromptResult { +export interface VercelAgentPromptResult { respondingModel: string; tokensUsage?: { promptTokens?: number; @@ -27,18 +26,21 @@ export interface AgentPromptResult { text: string; messages: Record[]; } + +// Generic interface for Agent, in case we need to switch to some other agent +// development SDK export interface Agent { prompt(prompt: string, model: Model, tools: Tools): Promise; } export function getVercelToolCallingAgent( requestedSystemPrompt?: string -): Agent, VercelMCPClientTools, AgentPromptResult> { +): Agent, VercelMCPClientTools, VercelAgentPromptResult> { return { async prompt(prompt: string, model: Model, tools: VercelMCPClientTools) { const result = await generateText({ model: model.getModel(), - system: [...systemPrompt, requestedSystemPrompt].join("\n"), + system: [...systemPrompt, requestedSystemPrompt].filter(Boolean).join("\n"), prompt, tools, maxSteps: 100, diff --git a/tests/accuracy/sdk/constants.ts b/tests/accuracy/sdk/constants.ts new file mode 100644 index 00000000..cd46a306 --- /dev/null +++ b/tests/accuracy/sdk/constants.ts @@ -0,0 +1,18 @@ +import path from "path"; +import { fileURLToPath } from "url"; + +const __dirname = fileURLToPath(import.meta.url); + +export const ROOT_DIR = path.join(__dirname, "..", "..", "..", ".."); + +export const DIST_DIR = path.join(ROOT_DIR, "dist"); + +export const MCP_SERVER_CLI_SCRIPT = path.join(DIST_DIR, "index.js"); + +export const TEST_DATA_DUMPS_DIR = path.join(__dirname, "test-data-dumps"); + +export const GENERATED_ASSETS_DIR = path.join(ROOT_DIR, ".accuracy"); + +export const LOCAL_SNAPSHOTS_FILE = path.join(GENERATED_ASSETS_DIR, "snapshots.json"); + +export const HTML_REPORT_FILE = path.join(GENERATED_ASSETS_DIR, "report.html"); diff 
--git a/tests/accuracy/sdk/describe-accuracy-tests.ts b/tests/accuracy/sdk/describe-accuracy-tests.ts index 1dd6d971..2a358ce1 100644 --- a/tests/accuracy/sdk/describe-accuracy-tests.ts +++ b/tests/accuracy/sdk/describe-accuracy-tests.ts @@ -1,5 +1,5 @@ import { TestableModels } from "./models.js"; -import { calculateToolCallingAccuracy } from "./accuracy-scorers.js"; +import { calculateToolCallingAccuracy } from "./accuracy-scorer.js"; import { getVercelToolCallingAgent, VercelAgent } from "./agent.js"; import { prepareTestData, setupMongoDBIntegrationTest } from "../../integration/tools/mongodb/mongodbHelpers.js"; import { AccuracyTestingClient, MockedTools } from "./accuracy-testing-client.js"; @@ -8,25 +8,39 @@ import { AccuracySnapshotStorage, ExpectedToolCall } from "./accuracy-snapshot-s import { getCommitSHA } from "./git-info.js"; export interface AccuracyTestConfig { - systemPrompt?: string; - injectConnectedAssumption?: boolean; + /** The prompt to be provided to LLM for evaluation. */ prompt: string; + + /** + * A list of tools and their parameters that we expect LLM to call based on + * how vague or detailed the prompt is. Ideally this should be a list of + * bare minimum and critical tool calls that are required to solve the + * problem mentioned in the prompt but because, for even a slightly vague + * prompt, LLM might decide to do additional confirmation by calling other + * tools, its fine to include those other tool calls as well to get a + * perfect 1 on the tool calling accuracy score. */ expectedToolCalls: ExpectedToolCall[]; - mockedTools: MockedTools; -} -export function describeSuite(suiteName: string, testConfigs: AccuracyTestConfig[]) { - return { - [suiteName]: testConfigs, - }; + /** + * The additional system prompt to be appended to already injected system + * prompt. 
*/ + systemPrompt?: string; + + /** + * A small hint appended to the actual prompt in test, which is supposed to + * hint LLM to assume that the MCP server is already connected so that it + * does not call the connect tool. + * By default it is assumed to be true */ + injectConnectedAssumption?: boolean; + + /** + * A map of tool names to their mocked implementation. When the mocked + * implementations are available, the testing client will prefer those over + * actual MCP tool calls. */ + mockedTools?: MockedTools; } -export function describeAccuracyTests( - models: TestableModels, - accuracyTestConfigs: { - [suiteName: string]: AccuracyTestConfig[]; - } -) { +export function describeAccuracyTests(models: TestableModels, accuracyTestConfigs: AccuracyTestConfig[]) { if (!process.env.MDB_ACCURACY_RUN_ID) { throw new Error("MDB_ACCURACY_RUN_ID env variable is required for accuracy test runs!"); } @@ -36,13 +50,12 @@ export function describeAccuracyTests( } const eachModel = describe.each(models); - const eachSuite = describe.each(Object.keys(accuracyTestConfigs)); eachModel(`$displayName`, function (model) { + const accuracyRunId = `${process.env.MDB_ACCURACY_RUN_ID}`; const mdbIntegration = setupMongoDBIntegrationTest(); const { populateTestData, cleanupTestDatabases } = prepareTestData(mdbIntegration); - const accuracyRunId: string = `${process.env.MDB_ACCURACY_RUN_ID}`; let commitSHA: string; let accuracySnapshotStorage: AccuracySnapshotStorage; let testMCPClient: AccuracyTestingClient; @@ -53,8 +66,8 @@ export function describeAccuracyTests( if (!retrievedCommitSHA) { throw new Error("Could not derive commitSHA, exiting accuracy tests!"); } - commitSHA = retrievedCommitSHA; + accuracySnapshotStorage = await getAccuracySnapshotStorage(); testMCPClient = await AccuracyTestingClient.initializeClient(mdbIntegration.connectionString()); agent = getVercelToolCallingAgent(); @@ -67,40 +80,39 @@ export function describeAccuracyTests( }); afterAll(async () => { - await 
accuracySnapshotStorage.close(); - await testMCPClient.close(); + await accuracySnapshotStorage?.close(); + await testMCPClient?.close(); }); - eachSuite("%s", function (suiteName) { - const eachTest = it.each(accuracyTestConfigs[suiteName] ?? []); - - eachTest("$prompt", async function (testConfig) { - testMCPClient.mockTools(testConfig.mockedTools); - const toolsForModel = await testMCPClient.vercelTools(); - const promptForModel = testConfig.injectConnectedAssumption - ? [testConfig.prompt, "(Assume that you are already connected to a MongoDB cluster!)"].join(" ") - : testConfig.prompt; - - const timeBeforePrompt = Date.now(); - const result = await agent.prompt(promptForModel, model, toolsForModel); - const timeAfterPrompt = Date.now(); - const toolCalls = testMCPClient.getToolCalls(); - const toolCallingAccuracy = calculateToolCallingAccuracy(testConfig.expectedToolCalls, toolCalls); - - const responseTime = timeAfterPrompt - timeBeforePrompt; - await accuracySnapshotStorage.createSnapshotEntry({ - accuracyRunId, - commitSHA, - provider: model.provider, - requestedModel: model.modelName, - test: suiteName, - prompt: testConfig.prompt, - llmResponseTime: responseTime, - toolCallingAccuracy: toolCallingAccuracy, - actualToolCalls: toolCalls, - expectedToolCalls: testConfig.expectedToolCalls, - ...result, - }); + const eachTest = it.each(accuracyTestConfigs); + + eachTest("$prompt", async function (testConfig) { + testMCPClient.mockTools(testConfig.mockedTools ?? {}); + const toolsForModel = await testMCPClient.vercelTools(); + const promptForModel = + testConfig.injectConnectedAssumption === false + ? 
testConfig.prompt + : [testConfig.prompt, "(Assume that you are already connected to a MongoDB cluster!)"].join(" "); + + const timeBeforePrompt = Date.now(); + const result = await agent.prompt(promptForModel, model, toolsForModel); + const timeAfterPrompt = Date.now(); + + const llmToolCalls = testMCPClient.getLLMToolCalls(); + const toolCallingAccuracy = calculateToolCallingAccuracy(testConfig.expectedToolCalls, llmToolCalls); + + const responseTime = timeAfterPrompt - timeBeforePrompt; + await accuracySnapshotStorage.createSnapshotEntry({ + accuracyRunId, + commitSHA, + provider: model.provider, + requestedModel: model.modelName, + prompt: testConfig.prompt, + llmResponseTime: responseTime, + toolCallingAccuracy: toolCallingAccuracy, + actualToolCalls: llmToolCalls, + expectedToolCalls: testConfig.expectedToolCalls, + ...result, }); }); }); diff --git a/tests/accuracy/sdk/models.ts b/tests/accuracy/sdk/models.ts index 70b80435..9f47028f 100644 --- a/tests/accuracy/sdk/models.ts +++ b/tests/accuracy/sdk/models.ts @@ -14,11 +14,10 @@ export interface Model

{ export class OpenAIModel implements Model { readonly provider = "OpenAI"; + readonly displayName: string; - constructor(readonly modelName: string) {} - - get displayName(): string { - return `${this.provider} - ${this.modelName}`; + constructor(readonly modelName: string) { + this.displayName = `${this.provider} - ${modelName}`; } isAvailable(): boolean { @@ -34,11 +33,10 @@ export class OpenAIModel implements Model { export class AzureOpenAIModel implements Model { readonly provider = "Azure"; + readonly displayName: string; - constructor(readonly modelName: string) {} - - get displayName(): string { - return `${this.provider} - ${this.modelName}`; + constructor(readonly modelName: string) { + this.displayName = `${this.provider} - ${modelName}`; } isAvailable(): boolean { @@ -56,11 +54,10 @@ export class AzureOpenAIModel implements Model { export class GeminiModel implements Model { readonly provider = "Google"; + readonly displayName: string; - constructor(readonly modelName: string) {} - - get displayName(): string { - return `${this.provider} - ${this.modelName}`; + constructor(readonly modelName: string) { + this.displayName = `${this.provider} - ${modelName}`; } isAvailable(): boolean { @@ -76,11 +73,10 @@ export class GeminiModel implements Model { export class OllamaModel implements Model { readonly provider = "Ollama"; + readonly displayName: string; - constructor(readonly modelName: string) {} - - get displayName(): string { - return `${this.provider} - ${this.modelName}`; + constructor(readonly modelName: string) { + this.displayName = `${this.provider} - ${modelName}`; } isAvailable(): boolean { diff --git a/tests/accuracy/update-many.test.ts b/tests/accuracy/update-many.test.ts index 4b82fbfb..86f96705 100644 --- a/tests/accuracy/update-many.test.ts +++ b/tests/accuracy/update-many.test.ts @@ -1,12 +1,10 @@ -import { describeAccuracyTests, describeSuite } from "./sdk/describe-accuracy-tests.js"; +import { describeAccuracyTests } from 
"./sdk/describe-accuracy-tests.js"; import { getAvailableModels } from "./sdk/models.js"; import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; function callsUpdateManyWithEmptyFilters(prompt: string): AccuracyTestConfig { return { - injectConnectedAssumption: true, prompt: prompt, - mockedTools: {}, expectedToolCalls: [ { toolName: "update-many", @@ -26,9 +24,7 @@ function callsUpdateManyWithEmptyFilters(prompt: string): AccuracyTestConfig { function callsUpdateManyWithFilters(prompt: string, filter: Record): AccuracyTestConfig { return { - injectConnectedAssumption: true, prompt: prompt, - mockedTools: {}, expectedToolCalls: [ { toolName: "update-many", @@ -47,14 +43,12 @@ function callsUpdateManyWithFilters(prompt: string, filter: Record { + describe("edge cases", () => { + it("should return 1 when both expected and actual are empty", () => { + const result = calculateToolCallingAccuracy([], []); + expect(result).toBe(1); + }); + + it("should return 0.75 when expected is empty but actual has tool calls", () => { + const actualToolCalls: LLMToolCall[] = [{ toolCallId: "1", toolName: "find", parameters: { db: "test" } }]; + const result = calculateToolCallingAccuracy([], actualToolCalls); + expect(result).toBe(0.75); + }); + + it("should return 0 when expected has tool calls but actual is empty", () => { + const expectedToolCalls: ExpectedToolCall[] = [{ toolName: "find", parameters: { db: "test" } }]; + const result = calculateToolCallingAccuracy(expectedToolCalls, []); + expect(result).toBe(0); + }); + }); + + describe("perfect matches", () => { + it("should return 1 for exact match with nested parameters", () => { + const expected: ExpectedToolCall[] = [ + { + toolName: "find", + parameters: { db: "test", collection: "users", filter: { age: { $gte: 18 }, status: "active" } }, + }, + ]; + const actual: LLMToolCall[] = [ + { + toolCallId: "1", + toolName: "find", + parameters: { db: "test", collection: "users", filter: { age: { $gte: 18 }, 
status: "active" } }, + }, + ]; + const result = calculateToolCallingAccuracy(expected, actual); + expect(result).toBe(1); + }); + + it("should return 1 for exact match with multiple diverse tool calls", () => { + const expected: ExpectedToolCall[] = [ + { toolName: "find", parameters: { db: "test", collection: "users", filter: { status: "active" } } }, + { + toolName: "aggregate", + parameters: { db: "test", collection: "orders", pipeline: [{ $match: { total: { $gt: 100 } } }] }, + }, + { toolName: "count", parameters: { db: "test", collection: "products" } }, + ]; + const actual: LLMToolCall[] = [ + { + toolCallId: "1", + toolName: "find", + parameters: { db: "test", collection: "users", filter: { status: "active" } }, + }, + { + toolCallId: "2", + toolName: "aggregate", + parameters: { db: "test", collection: "orders", pipeline: [{ $match: { total: { $gt: 100 } } }] }, + }, + { toolCallId: "3", toolName: "count", parameters: { db: "test", collection: "products" } }, + ]; + const result = calculateToolCallingAccuracy(expected, actual); + expect(result).toBe(1); + }); + }); + + describe("additional parameters", () => { + it("should return 0.75 when tool call has additional nested parameters", () => { + const expected: ExpectedToolCall[] = [ + { toolName: "find", parameters: { db: "test", collection: "users", filter: { status: "active" } } }, + ]; + const actual: LLMToolCall[] = [ + { + toolCallId: "1", + toolName: "find", + parameters: { + db: "test", + collection: "users", + filter: { status: "active", age: { $gte: 18 } }, + limit: 10, + }, + }, + ]; + const result = calculateToolCallingAccuracy(expected, actual); + expect(result).toBe(0.75); + }); + }); + + describe("missing or incorrect parameters", () => { + it("should return 0 when tool call has missing nested parameters", () => { + const expected: ExpectedToolCall[] = [ + { + toolName: "find", + parameters: { db: "test", collection: "users", filter: { status: "active", age: { $gte: 18 } } }, + }, + ]; + 
const actual: LLMToolCall[] = [ + { + toolCallId: "1", + toolName: "find", + parameters: { db: "test", collection: "users", filter: { status: "active" } }, + }, + ]; + const result = calculateToolCallingAccuracy(expected, actual); + expect(result).toBe(0); + }); + + it("should return 0 when aggregate tool call has incorrect pipeline", () => { + const expected: ExpectedToolCall[] = [ + { + toolName: "aggregate", + parameters: { db: "test", collection: "orders", pipeline: [{ $match: { total: { $gt: 100 } } }] }, + }, + ]; + const actual: LLMToolCall[] = [ + { + toolCallId: "1", + toolName: "aggregate", + parameters: { db: "test", collection: "orders", pipeline: [{ $match: { total: { $lt: 50 } } }] }, + }, + ]; + const result = calculateToolCallingAccuracy(expected, actual); + expect(result).toBe(0); + }); + }); + + describe("additional tool calls", () => { + it("should cap accuracy at 0.75 when LLM calls extra tools", () => { + const expected: ExpectedToolCall[] = [ + { toolName: "find", parameters: { db: "test", collection: "users", filter: { status: "active" } } }, + ]; + const actual: LLMToolCall[] = [ + { + toolCallId: "1", + toolName: "find", + parameters: { db: "test", collection: "users", filter: { status: "active" } }, + }, + { toolCallId: "2", toolName: "count", parameters: { db: "test", collection: "orders" } }, + { + toolCallId: "3", + toolName: "aggregate", + parameters: { + db: "test", + collection: "products", + pipeline: [{ $group: { _id: "$category", total: { $sum: 1 } } }], + }, + }, + ]; + const result = calculateToolCallingAccuracy(expected, actual); + expect(result).toBe(0.75); + }); + + it("should cap accuracy at 0.75 when LLM calls same tool multiple times with variations", () => { + const expected: ExpectedToolCall[] = [ + { toolName: "find", parameters: { db: "test", collection: "users", filter: { status: "active" } } }, + ]; + const actual: LLMToolCall[] = [ + { + toolCallId: "1", + toolName: "find", + parameters: { db: "test", collection: 
"users", filter: { status: "active" } }, + }, + { + toolCallId: "2", + toolName: "find", + parameters: { db: "test", collection: "users", filter: { status: "active", age: { $gte: 18 } } }, + }, + { toolCallId: "3", toolName: "find", parameters: { db: "test", collection: "users", limit: 10 } }, + ]; + const result = calculateToolCallingAccuracy(expected, actual); + expect(result).toBe(0.75); + }); + }); + + describe("missing tool calls", () => { + it("should return 0 if any expected tool call was not called", () => { + const expected: ExpectedToolCall[] = [ + { toolName: "find", parameters: { db: "test", collection: "users", filter: { status: "active" } } }, + { + toolName: "aggregate", + parameters: { db: "test", collection: "orders", pipeline: [{ $match: { total: { $gt: 100 } } }] }, + }, + ]; + const actual: LLMToolCall[] = [ + { + toolCallId: "1", + toolName: "find", + parameters: { db: "test", collection: "users", filter: { status: "active" } }, + }, + // Missing the aggregate tool call + ]; + const result = calculateToolCallingAccuracy(expected, actual); + expect(result).toBe(0); // One expected tool call was not called + }); + }); +}); From ebcc19df6e4ad57260d006193792d2eccdadf1cb Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Thu, 10 Jul 2025 10:52:36 +0200 Subject: [PATCH 39/91] chore: generate accuracy test summary post test --- resources/test-summary-template.html | 337 +++++++++++++++++++++++++++ scripts/generate-test-summary.ts | 156 +++++++++++++ scripts/run-accuracy-tests.sh | 1 + tests/accuracy/sdk/constants.ts | 6 +- 4 files changed, 499 insertions(+), 1 deletion(-) create mode 100644 resources/test-summary-template.html create mode 100644 scripts/generate-test-summary.ts diff --git a/resources/test-summary-template.html b/resources/test-summary-template.html new file mode 100644 index 00000000..318e9550 --- /dev/null +++ b/resources/test-summary-template.html @@ -0,0 +1,337 @@ + + + + + + MongoDB MCP Server - Accuracy Test Summary + + + +

+

📊 MongoDB MCP Server - Accuracy Test Summary

+
+

Run Information & Summary

+
+
+
Accuracy Run ID
+
{{accuracyRunId}}
+
+
+
Accuracy Run Status
+
{{runStatusUpper}}
+
+
+
Commit SHA
+
{{commitSHA}}
+
+
+
Report Generated On
+
{{reportGeneratedOn}}
+
+
+
Snapshots Captured On
+
{{createdOn}}
+
+
+
Total Prompts Evaluated
+
{{totalTests}}
+
+
+
Models Tested
+
{{modelsCount}}
+
+
+
Evals with 0% Accuracy
+
{{testsWithZeroAccuracy}}
+
+
+
+ + + + + + + + + + + + + + {{tableRows}} + +
PromptModelExpected Tool CallsLLM Tool CallsAccuracyLLM Response Time (ms)Total Tokens Used
+
+ + + diff --git a/scripts/generate-test-summary.ts b/scripts/generate-test-summary.ts new file mode 100644 index 00000000..efeacbdc --- /dev/null +++ b/scripts/generate-test-summary.ts @@ -0,0 +1,156 @@ +import { readFile, writeFile } from "fs/promises"; +import { getAccuracySnapshotStorage } from "../tests/accuracy/sdk/accuracy-snapshot-storage/get-snapshot-storage.js"; +import { HTML_TESTS_SUMMARY_FILE, HTML_TESTS_SUMMARY_TEMPLATE } from "../tests/accuracy/sdk/constants.js"; +import type { + AccuracySnapshotEntry, + ExpectedToolCall, + LLMToolCall, +} from "../tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.js"; + +function populateTemplate(template: string, data: Record): string { + return template.replace(/\{\{(\w+)\}\}/g, (_, key: string) => data[key] ?? ""); +} + +function formatAccuracy(accuracy: number): string { + return (accuracy * 100).toFixed(1) + "%"; +} + +function getAccuracyClass(accuracy: number): string { + if (accuracy === 1) return "accuracy-perfect"; + if (accuracy >= 0.75) return "accuracy-good"; + return "accuracy-poor"; +} + +function formatToolCallsWithTooltip(toolCalls: ExpectedToolCall[] | LLMToolCall[]): string { + return toolCalls + .map((call) => { + const params = JSON.stringify(call.parameters, null, 2); + return `${call.toolName}`; + }) + .join(", "); +} + +function formatTokenUsage(tokensUsage: { + promptTokens?: number; + completionTokens?: number; + totalTokens?: number; +}): string { + const total = tokensUsage.totalTokens || 0; + const prompt = tokensUsage.promptTokens || 0; + const completion = tokensUsage.completionTokens || 0; + + const tooltip = `Prompt: ${prompt}\nCompletion: ${completion}\nTotal: ${total}`; + return `${total}`; +} + +function formatMessages(messages: Array>): string { + return messages.map((msg) => JSON.stringify(msg, null, 2)).join("\n\n"); +} + +async function generateHtmlReport(snapshotEntries: AccuracySnapshotEntry[], accuracyRunId: string): Promise { + const totalPrompts = 
snapshotEntries.length; + const modelsCount = new Set(snapshotEntries.map((s) => `${s.provider} ${s.requestedModel}`)).size; + const testsWithZeroAccuracy = snapshotEntries.filter((snapshotEntry) => snapshotEntry.toolCallingAccuracy === 0); + + const firstSnapshotEntry = snapshotEntries[0]; + const runStatus = firstSnapshotEntry?.accuracyRunStatus || "unknown"; + const commitSHA = firstSnapshotEntry?.commitSHA || "unknown"; + const createdOn = firstSnapshotEntry?.createdOn + ? new Date(firstSnapshotEntry.createdOn).toLocaleString() + : "unknown"; + const reportGeneratedOn = new Date().toLocaleString(); + + const tableRows = snapshotEntries + .map( + (snapshotEntry, index) => ` + + + + ${snapshotEntry.prompt} + + ${snapshotEntry.provider} - ${snapshotEntry.requestedModel} + ${formatToolCallsWithTooltip(snapshotEntry.expectedToolCalls)} + ${formatToolCallsWithTooltip(snapshotEntry.actualToolCalls)} + + + ${formatAccuracy(snapshotEntry.toolCallingAccuracy)} + + + ${snapshotEntry.llmResponseTime.toFixed(2)} + ${formatTokenUsage(snapshotEntry.tokensUsage || {})} + + + +
+
+

🤖 LLM Response

+
${snapshotEntry.text}
+
+
+

💬 Conversation Messages

+
${formatMessages(snapshotEntry.messages)}
+
+
+ + + ` + ) + .join(""); + + // Read template file + const template = await readFile(HTML_TESTS_SUMMARY_TEMPLATE, "utf8"); + // Fill template + return populateTemplate(template, { + accuracyRunId, + runStatus, + runStatusUpper: runStatus.toUpperCase(), + commitSHA, + reportGeneratedOn, + createdOn, + totalTests: String(totalPrompts), + modelsCount: String(modelsCount), + testsWithZeroAccuracy: String(testsWithZeroAccuracy.length), + tableRows, + }); +} + +async function generateTestSummary(): Promise { + try { + const accuracyRunId = process.env.MDB_ACCURACY_RUN_ID; + if (!accuracyRunId) { + throw new Error("Cannot generate test summary, accuracy run id is unknown"); + } + console.log(`\n📊 Generating test summary for accuracy run: ${accuracyRunId}\n`); + + const storage = await getAccuracySnapshotStorage(); + const snapshot = await storage.getSnapshotForAccuracyRun(accuracyRunId); + await storage.close(); + + if (snapshot.length === 0) { + console.log("No snapshots found for the current run."); + return; + } + + const htmlReport = await generateHtmlReport(snapshot, accuracyRunId); + + const reportPath = HTML_TESTS_SUMMARY_FILE; + await writeFile(reportPath, htmlReport, "utf8"); + + console.log(`✅ HTML report generated: ${reportPath}`); + + const totalPrompts = snapshot.length; + const modelsCount = new Set(snapshot.map((s) => `${s.provider} ${s.requestedModel}`)).size; + const testsWithZeroAccuracy = snapshot.filter((snapshotEntry) => snapshotEntry.toolCallingAccuracy === 0); + + console.log(`\n📈 Summary:`); + console.log(` Total prompts evaluated: ${totalPrompts}`); + console.log(` Models tested: ${modelsCount}`); + console.log(` Evals with 0% accuracy: ${testsWithZeroAccuracy.length}`); + console.log(` Report saved to: ${reportPath}\n`); + } catch (error) { + console.error("Error generating test summary:", error); + process.exit(1); + } +} + +void generateTestSummary(); diff --git a/scripts/run-accuracy-tests.sh b/scripts/run-accuracy-tests.sh index 
e009661f..d6df473f 100644 --- a/scripts/run-accuracy-tests.sh +++ b/scripts/run-accuracy-tests.sh @@ -39,6 +39,7 @@ JEST_EXIT_CODE=$? # want to compare against an incomplete run. if [ $JEST_EXIT_CODE -eq 0 ]; then MDB_ACCURACY_RUN_STATUS="done" npx tsx scripts/update-accuracy-run-status.ts || echo "Warning: Failed to update accuracy run status to 'done'" + npx tsx scripts/generate-test-summary.ts || echo "Warning: Failed to generate test summary HTML report" else MDB_ACCURACY_RUN_STATUS="failed" npx tsx scripts/update-accuracy-run-status.ts || echo "Warning: Failed to update accuracy run status to 'failed'" fi diff --git a/tests/accuracy/sdk/constants.ts b/tests/accuracy/sdk/constants.ts index cd46a306..0598b1a7 100644 --- a/tests/accuracy/sdk/constants.ts +++ b/tests/accuracy/sdk/constants.ts @@ -7,6 +7,8 @@ export const ROOT_DIR = path.join(__dirname, "..", "..", "..", ".."); export const DIST_DIR = path.join(ROOT_DIR, "dist"); +export const RESOURCES_DIR = path.join(ROOT_DIR, "resources"); + export const MCP_SERVER_CLI_SCRIPT = path.join(DIST_DIR, "index.js"); export const TEST_DATA_DUMPS_DIR = path.join(__dirname, "test-data-dumps"); @@ -15,4 +17,6 @@ export const GENERATED_ASSETS_DIR = path.join(ROOT_DIR, ".accuracy"); export const LOCAL_SNAPSHOTS_FILE = path.join(GENERATED_ASSETS_DIR, "snapshots.json"); -export const HTML_REPORT_FILE = path.join(GENERATED_ASSETS_DIR, "report.html"); +export const HTML_TESTS_SUMMARY_FILE = path.join(GENERATED_ASSETS_DIR, "tests-summary.html"); + +export const HTML_TESTS_SUMMARY_TEMPLATE = path.join(RESOURCES_DIR, "test-summary-template.html"); From b1bf731d27071059212c7f593e63a65d749bd34f Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Thu, 10 Jul 2025 11:55:15 +0200 Subject: [PATCH 40/91] chore: add Github workflow to trigger test runs --- .github/workflows/accuracy-tests.yml | 44 ++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 .github/workflows/accuracy-tests.yml diff --git 
a/.github/workflows/accuracy-tests.yml b/.github/workflows/accuracy-tests.yml new file mode 100644 index 00000000..89188c16 --- /dev/null +++ b/.github/workflows/accuracy-tests.yml @@ -0,0 +1,44 @@ +name: Accuracy Tests + +on: + workflow_dispatch: + pull_request: + types: [labeled] + +jobs: + run-accuracy-tests: + name: Run Accuracy Tests + runs-on: ubuntu-latest + if: | + github.event_name == 'workflow_dispatch' || + (github.event_name == 'pull_request' && github.event.label.name == 'accuracy-tests') + env: + MDB_OPEN_AI_API_KEY: ${{ secrets.MDB_OPEN_AI_API_KEY }} + MDB_GEMINI_API_KEY: ${{ secrets.MDB_GEMINI_API_KEY }} + MDB_AZURE_OPEN_AI_API_KEY: ${{ secrets.MDB_AZURE_OPEN_AI_API_KEY }} + MDB_AZURE_OPEN_AI_API_URL: ${{ secrets.MDB_AZURE_OPEN_AI_API_URL }} + MDB_ACCURACY_MDB_URL: ${{ secrets.MDB_ACCURACY_MDB_URL }} + MDB_ACCURACY_MDB_DB: ${{ secrets.MDB_ACCURACY_MDB_DB }} + MDB_ACCURACY_MDB_COLLECTION: ${{ secrets.MDB_ACCURACY_MDB_COLLECTION }} + steps: + - uses: GitHubSecurityLab/actions-permissions/monitor@v1 + - uses: actions/checkout@v4 + - uses: actions/setup-node@v4 + with: + node-version-file: package.json + cache: 'npm' + - name: Install dependencies + run: npm ci + - name: Run accuracy tests + run: ./scripts/run-accuracy-tests.sh + - name: Upload accuracy test summary + if: always() + uses: actions/upload-artifact@v4 + with: + name: accuracy-test-summary + path: .accuracy/tests-summary.html + - name: Comment summary on PR + if: github.event_name == 'pull_request' && github.event.label.name == 'accuracy-tests' + uses: marocchino/sticky-pull-request-comment@v2 + with: + path: .accuracy/tests-summary.html \ No newline at end of file From 2e08208df01c216348f7e1a75273366ba2909e1a Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Thu, 10 Jul 2025 13:21:42 +0200 Subject: [PATCH 41/91] chore: fix permissions issue --- .github/workflows/accuracy-tests.yml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git 
a/.github/workflows/accuracy-tests.yml b/.github/workflows/accuracy-tests.yml index 89188c16..955b792a 100644 --- a/.github/workflows/accuracy-tests.yml +++ b/.github/workflows/accuracy-tests.yml @@ -9,6 +9,9 @@ jobs: run-accuracy-tests: name: Run Accuracy Tests runs-on: ubuntu-latest + permissions: + contents: read + pull-requests: write if: | github.event_name == 'workflow_dispatch' || (github.event_name == 'pull_request' && github.event.label.name == 'accuracy-tests') @@ -26,7 +29,7 @@ jobs: - uses: actions/setup-node@v4 with: node-version-file: package.json - cache: 'npm' + cache: "npm" - name: Install dependencies run: npm ci - name: Run accuracy tests @@ -41,4 +44,4 @@ jobs: if: github.event_name == 'pull_request' && github.event.label.name == 'accuracy-tests' uses: marocchino/sticky-pull-request-comment@v2 with: - path: .accuracy/tests-summary.html \ No newline at end of file + path: .accuracy/tests-summary.html From 509a23c4c5acd2fce5ac05d77dbe1e1b4f9d96a3 Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Thu, 10 Jul 2025 13:35:27 +0200 Subject: [PATCH 42/91] chore: bring back packages post merge --- package-lock.json | 9 +++++++++ package.json | 10 ++++++++++ scripts/run-accuracy-tests.sh | 2 +- 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/package-lock.json b/package-lock.json index b5405ad5..2627186b 100644 --- a/package-lock.json +++ b/package-lock.json @@ -31,18 +31,25 @@ "mongodb-mcp-server": "dist/index.js" }, "devDependencies": { + "@ai-sdk/anthropic": "^1.2.12", + "@ai-sdk/azure": "^1.3.23", + "@ai-sdk/openai": "^1.3.22", "@eslint/js": "^9.30.1", + "@himanshusinghs/google": "^1.2.11", "@modelcontextprotocol/inspector": "^0.16.0", "@redocly/cli": "^1.34.4", "@types/node": "^24.0.12", "@types/simple-oauth2": "^5.0.7", "@types/yargs-parser": "^21.0.3", "@vitest/coverage-v8": "^3.2.4", + "ai": "^4.3.16", "eslint": "^9.30.1", "eslint-config-prettier": "^10.1.5", "eslint-plugin-prettier": "^5.5.1", "globals": "^16.3.0", + "microdiff": 
"^1.5.0", "mongodb-runner": "^5.9.2", + "ollama-ai-provider": "^1.2.0", "openapi-types": "^12.1.3", "openapi-typescript": "^7.8.0", "prettier": "^3.6.2", @@ -50,6 +57,8 @@ "typescript": "^5.8.3", "typescript-eslint": "^8.36.0", "vitest": "^3.2.4", + "simple-git": "^3.28.0", + "uuid": "^11.1.0", "yaml": "^2.8.0" }, "engines": { diff --git a/package.json b/package.json index 205a2bac..612671be 100644 --- a/package.json +++ b/package.json @@ -30,22 +30,30 @@ "reformat": "prettier --write .", "generate": "./scripts/generate.sh", "test": "vitest --coverage", + "pre:test:accuracy": "npm run build:compile", "test:accuracy": "sh ./scripts/run-accuracy-tests.sh" }, "license": "Apache-2.0", "devDependencies": { + "@ai-sdk/anthropic": "^1.2.12", + "@ai-sdk/azure": "^1.3.23", + "@ai-sdk/openai": "^1.3.22", "@eslint/js": "^9.30.1", + "@himanshusinghs/google": "^1.2.11", "@modelcontextprotocol/inspector": "^0.16.0", "@redocly/cli": "^1.34.4", "@types/node": "^24.0.12", "@types/simple-oauth2": "^5.0.7", "@types/yargs-parser": "^21.0.3", "@vitest/coverage-v8": "^3.2.4", + "ai": "^4.3.16", "eslint": "^9.30.1", "eslint-config-prettier": "^10.1.5", "eslint-plugin-prettier": "^5.5.1", "globals": "^16.3.0", + "microdiff": "^1.5.0", "mongodb-runner": "^5.9.2", + "ollama-ai-provider": "^1.2.0", "openapi-types": "^12.1.3", "openapi-typescript": "^7.8.0", "prettier": "^3.6.2", @@ -53,6 +61,8 @@ "typescript": "^5.8.3", "typescript-eslint": "^8.36.0", "vitest": "^3.2.4", + "simple-git": "^3.28.0", + "uuid": "^11.1.0", "yaml": "^2.8.0" }, "dependencies": { diff --git a/scripts/run-accuracy-tests.sh b/scripts/run-accuracy-tests.sh index d6df473f..ae02dd06 100644 --- a/scripts/run-accuracy-tests.sh +++ b/scripts/run-accuracy-tests.sh @@ -18,7 +18,7 @@ export MDB_ACCURACY_RUN_ID=$(npx uuid v4) # npm run test:accuracy -- tests/accuracy/some-test.test.ts TEST_PATH_PATTERN="${1:-tests/accuracy}" shift || true -node --experimental-vm-modules node_modules/jest/bin/jest.js --bail --testPathPattern 
"$TEST_PATH_PATTERN" "$@" +node --experimental-vm-modules node_modules/jest/bin/jest.js --bail --testPathPatterns "$TEST_PATH_PATTERN" "$@" # Preserving the exit code from test run to correctly notify in the CI # environments when the tests fail. From be957b51b0bfd4e9d52dfa2c98881dfe907ccb67 Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Thu, 10 Jul 2025 22:53:50 +0200 Subject: [PATCH 43/91] chore: update report generation to include comparison with baseline as well --- .github/workflows/accuracy-tests.yml | 1 + resources/test-summary-template.html | 104 ++++++++++-- scripts/generate-test-summary.ts | 158 ++++++++++++++++-- .../mdb-snapshot-storage.ts | 2 +- 4 files changed, 234 insertions(+), 31 deletions(-) diff --git a/.github/workflows/accuracy-tests.yml b/.github/workflows/accuracy-tests.yml index 955b792a..640fdd1a 100644 --- a/.github/workflows/accuracy-tests.yml +++ b/.github/workflows/accuracy-tests.yml @@ -23,6 +23,7 @@ jobs: MDB_ACCURACY_MDB_URL: ${{ secrets.MDB_ACCURACY_MDB_URL }} MDB_ACCURACY_MDB_DB: ${{ secrets.MDB_ACCURACY_MDB_DB }} MDB_ACCURACY_MDB_COLLECTION: ${{ secrets.MDB_ACCURACY_MDB_COLLECTION }} + MDB_ACCURACY_BASELINE_COMMIT: ${{ github.event.pull_request.base.sha || '' }} steps: - uses: GitHubSecurityLab/actions-permissions/monitor@v1 - uses: actions/checkout@v4 diff --git a/resources/test-summary-template.html b/resources/test-summary-template.html index 318e9550..903457f8 100644 --- a/resources/test-summary-template.html +++ b/resources/test-summary-template.html @@ -31,16 +31,30 @@ background: #f8f9fa; padding: 20px; border-radius: 6px; - margin-bottom: 30px; + margin-bottom: 20px; border-left: 4px solid #00684a; } + .header-info:nth-child(3) { + border-left-color: #007bff; + } + .header-info:nth-child(4) { + border-left-color: #28a745; + } .header-info h2 { margin-top: 0; + margin-bottom: 15px; color: #00684a; + font-size: 1.2em; + } + .header-info:nth-child(3) h2 { + color: #007bff; + } + .header-info:nth-child(4) h2 { + color: 
#28a745; } .info-grid { display: grid; - grid-template-columns: repeat(auto-fit, minmax(250px, 1fr)); + grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 15px; margin-top: 15px; } @@ -158,22 +172,22 @@ .accuracy-perfect { background-color: #d4edda; color: #155724; - padding: 4px 8px; - border-radius: 4px; + padding: 2px 6px; + border-radius: 3px; font-weight: bold; } .accuracy-good { background-color: #fff3cd; color: #856404; - padding: 4px 8px; - border-radius: 4px; + padding: 2px 6px; + border-radius: 3px; font-weight: bold; } .accuracy-poor { background-color: #f8d7da; color: #721c24; - padding: 4px 8px; - border-radius: 4px; + padding: 2px 6px; + border-radius: 3px; font-weight: bold; } .tool-call { @@ -215,6 +229,29 @@ min-width: 80px; text-align: center; } + .baseline-accuracy-cell { + width: 8%; + min-width: 80px; + text-align: center; + } + .accuracy-comparison { + background: #e9ecef; + padding: 2px 6px; + border-radius: 3px; + font-weight: bold; + } + .accuracy-improved { + background: #d4edda; + color: #155724; + } + .accuracy-regressed { + background: #f8d7da; + color: #721c24; + } + .accuracy-same { + background: #e2e3e5; + color: #495057; + } .response-time-cell { width: 10%; min-width: 100px; @@ -264,28 +301,30 @@

📊 MongoDB MCP Server - Accuracy Test Summary

-

Run Information & Summary

+

📊 Current Run Information

Accuracy Run ID
{{accuracyRunId}}
-
-
Accuracy Run Status
-
{{runStatusUpper}}
-
Commit SHA
{{commitSHA}}
-
Report Generated On
-
{{reportGeneratedOn}}
+
Run Created On
+
{{createdOn}}
-
Snapshots Captured On
-
{{createdOn}}
+
Report Generated On
+
{{reportGeneratedOn}}
+
+
+ +
+

📈 Test Results Summary

+
Total Prompts Evaluated
{{totalTests}}
@@ -298,6 +337,36 @@

Run Information & Summary

Evals with 0% Accuracy
{{testsWithZeroAccuracy}}
+
+
Average Accuracy
+
{{averageAccuracy}}
+
+
+
+ +
+

🔄 Baseline Comparison

+
+
+
Baseline Accuracy Run ID
+
{{baselineAccuracyRunId}}
+
+
+
Baseline Commit SHA
+
{{baselineCommitSHA}}
+
+
+
Baseline Run Created On
+
{{baselineCreatedOn}}
+
+
+
Evals Improved vs Baseline
+
{{evalsImproved}}
+
+
+
Evals Regressed vs Baseline
+
{{evalsRegressed}}
+
@@ -308,6 +377,7 @@

Run Information & Summary

+ diff --git a/scripts/generate-test-summary.ts b/scripts/generate-test-summary.ts index efeacbdc..fba40610 100644 --- a/scripts/generate-test-summary.ts +++ b/scripts/generate-test-summary.ts @@ -7,6 +7,15 @@ import type { LLMToolCall, } from "../tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.js"; +interface BaselineComparison { + baselineAccuracy?: number; + comparisonResult?: "improved" | "regressed" | "same"; +} + +interface SnapshotEntryWithBaseline extends AccuracySnapshotEntry { + baseline?: BaselineComparison; +} + function populateTemplate(template: string, data: Record): string { return template.replace(/\{\{(\w+)\}\}/g, (_, key: string) => data[key] ?? ""); } @@ -47,11 +56,91 @@ function formatMessages(messages: Array>): string { return messages.map((msg) => JSON.stringify(msg, null, 2)).join("\n\n"); } -async function generateHtmlReport(snapshotEntries: AccuracySnapshotEntry[], accuracyRunId: string): Promise { +function formatBaselineAccuracy(snapshot: SnapshotEntryWithBaseline): string { + if (!snapshot.baseline || snapshot.baseline.baselineAccuracy === undefined) { + return 'N/A'; + } + + const baselineAccuracyText = formatAccuracy(snapshot.baseline.baselineAccuracy); + let comparisonClass = "accuracy-comparison"; + let comparisonIcon = ""; + + if (snapshot.baseline.comparisonResult) { + switch (snapshot.baseline.comparisonResult) { + case "improved": + comparisonClass += " accuracy-improved"; + comparisonIcon = " ↗"; + break; + case "regressed": + comparisonClass += " accuracy-regressed"; + comparisonIcon = " ↘"; + break; + case "same": + comparisonClass += " accuracy-same"; + comparisonIcon = " →"; + break; + } + } + + return `${baselineAccuracyText}${comparisonIcon}`; +} + +function compareSnapshotEntries( + currentSnapshotEntries: AccuracySnapshotEntry[], + baselineSnapshotEntries: AccuracySnapshotEntry[] +): SnapshotEntryWithBaseline[] { + const baselineMap = new Map(); + baselineSnapshotEntries.forEach((entry) => { + const key = 
`${entry.provider}|${entry.requestedModel}|${entry.prompt}`; + baselineMap.set(key, entry); + }); + + return currentSnapshotEntries.map((entry) => { + const key = `${entry.provider}|${entry.requestedModel}|${entry.prompt}`; + const baselineEntry = baselineMap.get(key); + + if (!baselineEntry) { + return entry; + } + + let comparisonResult: "improved" | "regressed" | "same"; + if (entry.toolCallingAccuracy > baselineEntry.toolCallingAccuracy) { + comparisonResult = "improved"; + } else if (entry.toolCallingAccuracy < baselineEntry.toolCallingAccuracy) { + comparisonResult = "regressed"; + } else { + comparisonResult = "same"; + } + + return { + ...entry, + baseline: { + baselineAccuracy: baselineEntry.toolCallingAccuracy, + comparisonResult, + }, + }; + }); +} + +async function generateHtmlReport( + snapshotEntries: SnapshotEntryWithBaseline[], + accuracyRunId: string, + baselineInfo?: { + commitSHA: string; + accuracyRunId: string; + createdOn: string; + } +): Promise { const totalPrompts = snapshotEntries.length; const modelsCount = new Set(snapshotEntries.map((s) => `${s.provider} ${s.requestedModel}`)).size; const testsWithZeroAccuracy = snapshotEntries.filter((snapshotEntry) => snapshotEntry.toolCallingAccuracy === 0); + const totalAccuracy = snapshotEntries.reduce((sum, entry) => sum + entry.toolCallingAccuracy, 0); + const averageAccuracy = totalPrompts > 0 ? 
totalAccuracy / totalPrompts : 0; + + const evalsImproved = snapshotEntries.filter((s) => s.baseline?.comparisonResult === "improved").length; + const evalsRegressed = snapshotEntries.filter((s) => s.baseline?.comparisonResult === "regressed").length; + const firstSnapshotEntry = snapshotEntries[0]; const runStatus = firstSnapshotEntry?.accuracyRunStatus || "unknown"; const commitSHA = firstSnapshotEntry?.commitSHA || "unknown"; @@ -76,11 +165,12 @@ async function generateHtmlReport(snapshotEntries: AccuracySnapshotEntry[], accu ${formatAccuracy(snapshotEntry.toolCallingAccuracy)} + - - - - - - - - - - - - + + + + + + + + + + + - - ` + + + + ` ) .join(""); const template = await readFile(HTML_TESTS_SUMMARY_TEMPLATE, "utf8"); return populateTemplate(template, { - accuracyRunId, - runStatus, - runStatusUpper: runStatus.toUpperCase(), - commitSHA, - reportGeneratedOn, - createdOn, - totalTests: String(totalPrompts), - modelsCount: String(modelsCount), - testsWithZeroAccuracy: String(testsWithZeroAccuracy.length), - averageAccuracy: formatAccuracy(averageAccuracy), - baselineCommitSHA: baselineInfo?.commitSHA || "N/A", - baselineAccuracyRunId: baselineInfo?.accuracyRunId || "N/A", - baselineCreatedOn: baselineInfo?.createdOn || "N/A", - evalsImproved: String(evalsImproved), - evalsRegressed: String(evalsRegressed), + commitSHA: comparableResult.commitSHA, + accuracyRunId: comparableResult.runId, + accuracyRunStatus: formatRunStatus(comparableResult.runStatus), + reportGeneratedOn: testSummary.reportGeneratedOn, + createdOn: testSummary.resultCreatedOn, + totalTests: String(testSummary.totalPrompts), + modelsCount: String(testSummary.totalModels), + testsWithZeroAccuracy: String(testSummary.testsWithZeroAccuracy.length), + averageAccuracy: formatAccuracy(testSummary.averageAccuracy), + baselineCommitSHA: baselineInfo?.commitSHA || "-", + baselineAccuracyRunId: baselineInfo?.accuracyRunId || "-", + baselineAccuracyRunStatus: baselineInfo?.accuracyRunStatus + ? 
formatRunStatus(baselineInfo?.accuracyRunStatus) + : "-", + baselineCreatedOn: baselineInfo?.createdOn || "-", + evalsImproved: baselineInfo ? String(testSummary.evalsImproved) : "-", + evalsRegressed: baselineInfo ? String(testSummary.evalsRegressed) : "-", tableRows, }); } -async function generateTestSummary(): Promise { +async function generateTestSummary() { + const storage = getAccuracyResultStorage(); try { + const baselineCommit = process.env.MDB_ACCURACY_BASELINE_COMMIT; + const accuracyRunCommit = await getCommitSHA(); const accuracyRunId = process.env.MDB_ACCURACY_RUN_ID; - const baselineCommitSHA = process.env.MDB_ACCURACY_BASELINE_COMMIT; - if (!accuracyRunId) { - throw new Error("Cannot generate test summary, accuracy run id is unknown"); + if (!accuracyRunCommit) { + throw new Error("Cannot generate summary without accuracyRunCommit"); } - console.log(`\n📊 Generating test summary for accuracy run: ${accuracyRunId}\n`); - - const storage = await getAccuracySnapshotStorage(); - const currentSnapshot = await storage.getSnapshotForAccuracyRun(accuracyRunId); - if (currentSnapshot.length === 0) { - console.log("No snapshot entries found for the current run."); - await storage.close(); - return; + const accuracyRunResult = await storage.getAccuracyResult(accuracyRunCommit, accuracyRunId); + if (!accuracyRunResult) { + throw new Error( + `No accuracy run result found for commitSHA - ${accuracyRunCommit}, runId - ${accuracyRunId}` + ); } - let snapshotWithBaseline: SnapshotEntryWithBaseline[] = currentSnapshot; - let baselineInfo: { commitSHA: string; accuracyRunId: string; createdOn: string } | undefined; - - if (baselineCommitSHA) { - console.log(`🔍 Fetching baseline snapshot entries for commit: ${baselineCommitSHA}`); - const baselineSnapshot = await storage.getLatestSnapshotForCommit(baselineCommitSHA); - - if (baselineSnapshot.length > 0) { - console.log(`✅ Found ${baselineSnapshot.length} baseline snapshot entries.`); - snapshotWithBaseline = 
compareSnapshotEntries(currentSnapshot, baselineSnapshot); - - const firstBaselineSnapshot = baselineSnapshot[0]; - if (firstBaselineSnapshot) { - baselineInfo = { - commitSHA: firstBaselineSnapshot.commitSHA, - accuracyRunId: firstBaselineSnapshot.accuracyRunId, - createdOn: firstBaselineSnapshot.createdOn - ? new Date(firstBaselineSnapshot.createdOn).toLocaleString() - : "unknown", - }; + const baselineAccuracyRunResult = baselineCommit ? await storage.getAccuracyResult(baselineCommit) : null; + const baselineInfo: BaselineRunInfo | null = + baselineCommit && baselineAccuracyRunResult + ? { + commitSHA: baselineCommit, + accuracyRunId: baselineAccuracyRunResult.runId, + accuracyRunStatus: baselineAccuracyRunResult.runStatus, + createdOn: new Date(baselineAccuracyRunResult.createdOn).toLocaleString(), + } + : null; + + const comparableAccuracyResult: ComparableAccuracyResult = { + ...accuracyRunResult, + promptAndModelResponses: accuracyRunResult.promptResults.flatMap( + (currentPromptResult) => { + const baselinePromptResult = baselineAccuracyRunResult?.promptResults.find((baselineResult) => { + return baselineResult.prompt === currentPromptResult.prompt; + }); + + return currentPromptResult.modelResponses.map((currentModelResponse) => { + const baselineModelResponse = baselinePromptResult?.modelResponses.find( + (baselineModelResponse) => { + return ( + baselineModelResponse.provider === currentModelResponse.provider && + baselineModelResponse.requestedModel === currentModelResponse.requestedModel + ); + } + ); + return { + ...currentModelResponse, + prompt: currentPromptResult.prompt, + baselineToolAccuracy: baselineModelResponse?.toolCallingAccuracy, + }; + }); } - } else { - console.log(`⚠️ No baseline snapshots found for commit: ${baselineCommitSHA}`); - } - } - - const htmlReport = await generateHtmlReport(snapshotWithBaseline, accuracyRunId, baselineInfo); - await storage.close(); + ), + }; - const reportPath = HTML_TESTS_SUMMARY_FILE; - await 
writeFile(reportPath, htmlReport, "utf8"); + console.log(`\n📊 Generating test summary for accuracy run: ${accuracyRunId}\n`); + const testSummary = getTestSummary(comparableAccuracyResult); + const htmlReport = await generateHtmlReport(comparableAccuracyResult, testSummary, baselineInfo); - console.log(`✅ HTML report generated: ${reportPath}`); + await writeFile(HTML_TESTS_SUMMARY_FILE, htmlReport, "utf8"); - const totalPrompts = snapshotWithBaseline.length; - const modelsCount = new Set(snapshotWithBaseline.map((s) => `${s.provider} ${s.requestedModel}`)).size; - const testsWithZeroAccuracy = snapshotWithBaseline.filter( - (snapshotEntry) => snapshotEntry.toolCallingAccuracy === 0 - ); - const evalsImproved = snapshotWithBaseline.filter((s) => s.baseline?.comparisonResult === "improved").length; - const evalsRegressed = snapshotWithBaseline.filter((s) => s.baseline?.comparisonResult === "regressed").length; + console.log(`✅ HTML report generated: ${HTML_TESTS_SUMMARY_FILE}`); console.log(`\n📈 Summary:`); - console.log(` Total prompts evaluated: ${totalPrompts}`); - console.log(` Models tested: ${modelsCount}`); - console.log(` Evals with 0% accuracy: ${testsWithZeroAccuracy.length}`); - - if (baselineCommitSHA) { - console.log(` Baseline commit: ${baselineCommitSHA}`); - console.log(` Evals improved vs baseline: ${evalsImproved}`); - console.log(` Evals regressed vs baseline: ${evalsRegressed}`); + console.log(` Total prompts evaluated: ${testSummary.totalPrompts}`); + console.log(` Models tested: ${testSummary.totalModels}`); + console.log(` Evals with 0% accuracy: ${testSummary.testsWithZeroAccuracy.length}`); + + if (baselineCommit) { + console.log(` Baseline commit: ${baselineCommit}`); + console.log(` Evals improved vs baseline: ${testSummary.evalsImproved}`); + console.log(` Evals regressed vs baseline: ${testSummary.evalsRegressed}`); } } catch (error) { console.error("Error generating test summary:", error); process.exit(1); + } finally { + await 
storage.close(); } } diff --git a/tests/accuracy/create-collection.test.ts b/tests/accuracy/create-collection.test.ts index db7f888c..89d6980d 100644 --- a/tests/accuracy/create-collection.test.ts +++ b/tests/accuracy/create-collection.test.ts @@ -1,7 +1,7 @@ import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; import { getAvailableModels } from "./sdk/models.js"; import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; -import { ExpectedToolCall } from "./sdk/accuracy-snapshot-storage/snapshot-storage.js"; +import { ExpectedToolCall } from "./sdk/accuracy-result-storage/result-storage.js"; function callsCreateCollection(prompt: string, database: string, collection: string): AccuracyTestConfig { return { diff --git a/tests/accuracy/drop-collection.test.ts b/tests/accuracy/drop-collection.test.ts index 98ba3348..a9f2494c 100644 --- a/tests/accuracy/drop-collection.test.ts +++ b/tests/accuracy/drop-collection.test.ts @@ -1,7 +1,7 @@ import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; import { getAvailableModels } from "./sdk/models.js"; import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; -import { ExpectedToolCall } from "./sdk/accuracy-snapshot-storage/snapshot-storage.js"; +import { ExpectedToolCall } from "./sdk/accuracy-result-storage/result-storage.js"; function onlyCallsDropCollection(prompt: string): AccuracyTestConfig { return { diff --git a/tests/accuracy/drop-database.test.ts b/tests/accuracy/drop-database.test.ts index 53fc7fd5..74876658 100644 --- a/tests/accuracy/drop-database.test.ts +++ b/tests/accuracy/drop-database.test.ts @@ -1,7 +1,7 @@ import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; import { getAvailableModels } from "./sdk/models.js"; import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; -import { ExpectedToolCall } from "./sdk/accuracy-snapshot-storage/snapshot-storage.js"; +import { ExpectedToolCall } from 
"./sdk/accuracy-result-storage/result-storage.js"; function onlyCallsDropDatabase(prompt: string): AccuracyTestConfig { return { diff --git a/tests/accuracy/logs.test.ts b/tests/accuracy/logs.test.ts index 8b9d2193..2dbe8d09 100644 --- a/tests/accuracy/logs.test.ts +++ b/tests/accuracy/logs.test.ts @@ -1,7 +1,7 @@ import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; import { getAvailableModels } from "./sdk/models.js"; import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; -import { ExpectedToolCall } from "./sdk/accuracy-snapshot-storage/snapshot-storage.js"; +import { ExpectedToolCall } from "./sdk/accuracy-result-storage/result-storage.js"; function callsLogsTool(prompt: string, toolCall: ExpectedToolCall): AccuracyTestConfig { return { diff --git a/tests/accuracy/sdk/accuracy-result-storage/disk-storage.ts b/tests/accuracy/sdk/accuracy-result-storage/disk-storage.ts new file mode 100644 index 00000000..204a553b --- /dev/null +++ b/tests/accuracy/sdk/accuracy-result-storage/disk-storage.ts @@ -0,0 +1,169 @@ +import path from "path"; +import fs from "fs/promises"; +import { ACCURACY_RESULTS_DIR, LATEST_ACCURACY_RUN_NAME } from "../constants.js"; +import { + AccuracyResult, + AccuracyResultStorage, + AccuracyRunStatus, + AccuracyRunStatuses, + ModelResponse, +} from "./result-storage.js"; + +export class DiskBasedResultStorage implements AccuracyResultStorage { + async getAccuracyResult(commitSHA: string, runId?: string): Promise { + const filePath = runId + ? // If we have both commit and runId then we get the path for + // specific file. Common case when saving prompt responses during an + // accuracy run + this.getAccuracyResultFilePath(commitSHA, runId) + : // If we only have commit then we grab the latest successful run for the + // commit. The latest run is a link to the last run that was + // marked as successful. 
+ this.getAccuracyResultFilePath(commitSHA, LATEST_ACCURACY_RUN_NAME); + + try { + const raw = await fs.readFile(filePath, "utf8"); + return JSON.parse(raw) as AccuracyResult; + } catch (error) { + if ((error as { code: string }).code === "ENOENT") { + return null; + } + throw error; + } + } + + async updateRunStatus(commitSHA: string, runId: string, status: AccuracyRunStatuses): Promise { + await this.atomicWriteResult(commitSHA, runId, async () => { + const accuracyResult = await this.getAccuracyResult(commitSHA, runId); + if (!accuracyResult) { + throw new Error( + `Cannot update run status to ${status} for commit - ${commitSHA}, runId - ${runId}. Results not found!` + ); + } + + return { + ...accuracyResult, + runStatus: status, + }; + }); + + // This bit is important to mark the current run as the latest run for a + // commit so that we can use that during baseline comparison. + if (status === AccuracyRunStatus.Done) { + await this.atomicUpdateLink( + this.getAccuracyResultFilePath(commitSHA, runId), + this.getLatestResultFilePath(commitSHA) + ); + } + } + + async saveModelResponseForPrompt( + commitSHA: string, + runId: string, + prompt: string, + modelResponse: ModelResponse + ): Promise { + await this.atomicWriteResult(commitSHA, runId, async () => { + const accuracyResult = await this.getAccuracyResult(commitSHA, runId); + if (!accuracyResult) { + return { + runId, + runStatus: AccuracyRunStatus.InProgress, + createdOn: Date.now(), + commitSHA, + promptResults: [ + { + prompt, + modelResponses: [modelResponse], + }, + ], + }; + } + + const existingPromptIdx = accuracyResult.promptResults.findIndex((result) => result.prompt === prompt); + const promptResult = accuracyResult.promptResults[existingPromptIdx]; + if (!promptResult) { + return { + ...accuracyResult, + promptResults: [ + ...accuracyResult.promptResults, + { + prompt, + modelResponses: [modelResponse], + }, + ], + }; + } + + accuracyResult.promptResults.splice(existingPromptIdx, 1, { + prompt: 
promptResult.prompt, + modelResponses: [...promptResult.modelResponses, modelResponse], + }); + + return accuracyResult; + }); + } + + close(): Promise { + return Promise.resolve(); + } + + private async atomicWriteResult( + commitSHA: string, + runId: string, + generateResult: () => Promise + ): Promise { + for (let attempt = 0; attempt < 10; attempt++) { + // This should happen outside the try catch to let the result + // generation error bubble up. + const result = await generateResult(); + const resultFilePath = this.getAccuracyResultFilePath(commitSHA, runId); + try { + const tmp = `${resultFilePath}~${Date.now()}`; + await fs.writeFile(tmp, JSON.stringify(result, null, 2)); + await fs.rename(tmp, resultFilePath); + return; + } catch (error) { + if ((error as { code: string }).code === "ENOENT") { + const baseDir = path.dirname(resultFilePath); + await fs.mkdir(baseDir, { recursive: true }); + } + + if (attempt < 10) { + await this.waitFor(100 + Math.random() * 200); + } else { + throw error; + } + } + } + } + + private async atomicUpdateLink(filePath: string, linkPath: string) { + for (let attempt = 0; attempt < 10; attempt++) { + try { + const tempLinkPath = `${linkPath}~${Date.now()}`; + await fs.link(filePath, tempLinkPath); + await fs.rename(tempLinkPath, linkPath); + return; + } catch (error) { + if (attempt < 10) { + await this.waitFor(100 + Math.random() * 200); + } else { + throw error; + } + } + } + } + + private getAccuracyResultFilePath(commitSHA: string, runId: string): string { + return path.join(ACCURACY_RESULTS_DIR, commitSHA, `${runId}.json`); + } + + private getLatestResultFilePath(commitSHA: string): string { + return path.join(ACCURACY_RESULTS_DIR, commitSHA, `${LATEST_ACCURACY_RUN_NAME}.json`); + } + + private waitFor(ms: number) { + return new Promise((resolve) => setTimeout(resolve, ms)); + } +} diff --git a/tests/accuracy/sdk/accuracy-result-storage/get-accuracy-result-storage.ts 
b/tests/accuracy/sdk/accuracy-result-storage/get-accuracy-result-storage.ts new file mode 100644 index 00000000..390ca231 --- /dev/null +++ b/tests/accuracy/sdk/accuracy-result-storage/get-accuracy-result-storage.ts @@ -0,0 +1,10 @@ +import { DiskBasedResultStorage } from "./disk-storage.js"; +import { MongoDBBasedResultStorage } from "./mongodb-storage.js"; +import { AccuracyResultStorage } from "./result-storage.js"; + +export function getAccuracyResultStorage(): AccuracyResultStorage { + if (process.env.MDB_ACCURACY_MDB_URL) { + return new MongoDBBasedResultStorage(process.env.MDB_ACCURACY_MDB_URL); + } + return new DiskBasedResultStorage(); +} diff --git a/tests/accuracy/sdk/accuracy-result-storage/mongodb-storage.ts b/tests/accuracy/sdk/accuracy-result-storage/mongodb-storage.ts new file mode 100644 index 00000000..000dce48 --- /dev/null +++ b/tests/accuracy/sdk/accuracy-result-storage/mongodb-storage.ts @@ -0,0 +1,103 @@ +import { Collection, MongoClient } from "mongodb"; +import { + AccuracyResult, + AccuracyResultStorage, + AccuracyRunStatus, + AccuracyRunStatuses, + ModelResponse, +} from "./result-storage.js"; + +export class MongoDBBasedResultStorage implements AccuracyResultStorage { + private client: MongoClient; + private resultCollection: Collection; + + constructor( + connectionString: string, + // Omitting these as they might contain large chunk of texts + private readonly omittedModelResponseFields: (keyof ModelResponse)[] = ["messages", "text"] + ) { + this.client = new MongoClient(connectionString); + this.resultCollection = this.client.db("mongodb-mcp-server").collection("accuracy-results"); + } + + async getAccuracyResult(commitSHA: string, runId?: string): Promise { + const filters: Partial = runId + ? { commitSHA, runId } + : // Note that we use the `Done` status filter only when asked for + // a commit. 
That is because the one use case of asking for a run + // for commit is when you want the last successful run of that + // particular commit. + { commitSHA, runStatus: AccuracyRunStatus.Done }; + return await this.resultCollection.findOne(filters, { + sort: { + createdOn: -1, + }, + }); + } + + async updateRunStatus(commitSHA: string, runId: string, status: AccuracyRunStatuses): Promise { + await this.resultCollection.updateOne( + { commitSHA, runId }, + { + $set: { + runStatus: status, + }, + } + ); + } + + async saveModelResponseForPrompt( + commitSHA: string, + runId: string, + prompt: string, + modelResponse: ModelResponse + ): Promise { + const savedModelResponse: ModelResponse = { ...modelResponse }; + for (const field of this.omittedModelResponseFields) { + delete savedModelResponse[field]; + } + + await this.resultCollection.updateOne( + { commitSHA, runId }, + { + $setOnInsert: { + runStatus: AccuracyRunStatus.InProgress, + createdOn: Date.now(), + commitSHA, + runId, + promptResults: [], + }, + }, + { upsert: true } + ); + + await this.resultCollection.updateOne( + { + commitSHA, + runId, + "promptResults.prompt": { $ne: prompt }, + }, + { + $push: { + promptResults: { prompt, modelResponses: [] }, + }, + } + ); + + await this.resultCollection.updateOne( + { commitSHA, runId }, + { + $push: { + "promptResults.$[promptElement].modelResponses": savedModelResponse, + }, + }, + { + arrayFilters: [{ "promptElement.prompt": prompt }], + } + ); + } + + async close(): Promise { + await this.client.close(); + } +} diff --git a/tests/accuracy/sdk/accuracy-result-storage/result-storage.ts b/tests/accuracy/sdk/accuracy-result-storage/result-storage.ts new file mode 100644 index 00000000..737ee32d --- /dev/null +++ b/tests/accuracy/sdk/accuracy-result-storage/result-storage.ts @@ -0,0 +1,116 @@ +export interface LLMToolCall { + toolCallId: string; + toolName: string; + parameters: Record; +} + +export type ExpectedToolCall = Omit; + +export const AccuracyRunStatus = { 
+ Done: "done", + Failed: "failed", + InProgress: "in-progress", +} as const; + +export type AccuracyRunStatuses = (typeof AccuracyRunStatus)[keyof typeof AccuracyRunStatus]; + +export interface AccuracyResult { + /** + * A unique id for each accuracy run. Should either be generated by the + * script triggering the accuracy run or provided via environment variables. + * */ + runId: string; + /** + * Represents the status of accuracy run. Each test completion, during an + * accuracy run, is supposed to submit an accuracy result entry with + * InProgress status which then later, after completion of accuracy run, is + * updated to either Done or Failed, depending on whether there were errors + * during the run or not. */ + runStatus: AccuracyRunStatuses; + /** + * Timestamp of when this result entry was generated. */ + createdOn: number; + /** + * The commit SHA for which the accuracy run was triggered. */ + commitSHA: string; + /** + * A list of results for different prompts tested in the accuracy run. */ + promptResults: PromptResult[]; +} + +export interface PromptResult { + /** + * The actual prompt that was provided to LLM as test */ + prompt: string; + /** + * The responses from the LLMs tested, when provided with the prompt. */ + modelResponses: ModelResponse[]; +} + +export interface ModelResponse { + /** + * The LLM provider providing the LLM APIs */ + provider: string; + /** + * The LLM which was requested to respond to our test prompts */ + requestedModel: string; + /** + * The ID of the model that actually responded to our prompt request. */ + respondingModel: string; + /** + * The total time taken by LLM to respond to our prompt. */ + llmResponseTime: number; + /** + * A number between 0 and 1, representing how accurately the expected tools + * were called by LLM when responding to the provided prompts. 
To know more + * about how this number is generated, check - toolCallingAccuracy.ts */ + toolCallingAccuracy: number; + /** + * A list of tools, along with their parameters, that are expected to be + * called by the LLM in test. */ + expectedToolCalls: ExpectedToolCall[]; + /** + * A list of tools, along with their parameters, that were actually called + * by the LLM in test. */ + llmToolCalls: LLMToolCall[]; + /** + * Token usage data, returned as part of LLM prompt response. */ + tokensUsed?: TokensUsed; + /** + * The final response text generated by the LLM, in response to our prompt + * request. */ + text?: string; + /** + * A list of messages, exchanged between LLM and our testing agent, in + * response to our prompt request. This is particularly helpful for + * debugging. */ + messages?: Record[]; +} + +interface TokensUsed { + promptTokens?: number; + completionTokens?: number; + totalTokens?: number; +} + +export interface AccuracyResultStorage { + /** + * Retrieves the accuracy result for the provided commit SHA and optionally + * the run id. When the run id is omitted, the implementation fetches the + * result for the last successful accuracy run otherwise it fetches the + * result regardless of the run status. */ + getAccuracyResult(commitSHA: string, runId?: string): Promise; + /** + * Updates the status of the run */ + updateRunStatus(commitSHA: string, runId: string, status: AccuracyRunStatuses): Promise; + /** + * Attempts to atomically insert the model response for the prompt in the + * stored accuracy result. 
*/ + saveModelResponseForPrompt( + commitSHA: string, + runId: string, + prompt: string, + modelResponse: ModelResponse + ): Promise; + close(): Promise; +} diff --git a/tests/accuracy/sdk/accuracy-scorer.ts b/tests/accuracy/sdk/accuracy-scorer.ts index 2ae13e6c..261f48dc 100644 --- a/tests/accuracy/sdk/accuracy-scorer.ts +++ b/tests/accuracy/sdk/accuracy-scorer.ts @@ -1,5 +1,5 @@ import diff from "microdiff"; -import { ExpectedToolCall, LLMToolCall } from "./accuracy-snapshot-storage/snapshot-storage.js"; +import { ExpectedToolCall, LLMToolCall } from "./accuracy-result-storage/result-storage.js"; /** * Tool calling accuracy is a single number calculated based on two dimensions. diff --git a/tests/accuracy/sdk/accuracy-snapshot-storage/disk-snapshot-storage.ts b/tests/accuracy/sdk/accuracy-snapshot-storage/disk-snapshot-storage.ts deleted file mode 100644 index a919e8f0..00000000 --- a/tests/accuracy/sdk/accuracy-snapshot-storage/disk-snapshot-storage.ts +++ /dev/null @@ -1,117 +0,0 @@ -import fs from "fs/promises"; -import { - AccuracyRunStatus, - AccuracyRunStatuses, - AccuracySnapshotEntry, - AccuracySnapshotEntrySchema, - AccuracySnapshotStorage, -} from "./snapshot-storage.js"; -import { GENERATED_ASSETS_DIR, LOCAL_SNAPSHOTS_FILE } from "../constants.js"; - -export class DiskSnapshotStorage implements AccuracySnapshotStorage { - async createSnapshotEntry( - snapshotEntry: Pick< - AccuracySnapshotEntry, - | "accuracyRunId" - | "commitSHA" - | "provider" - | "requestedModel" - | "prompt" - | "toolCallingAccuracy" - | "expectedToolCalls" - | "actualToolCalls" - | "llmResponseTime" - | "tokensUsage" - | "respondingModel" - | "text" - | "messages" - > - ): Promise { - const snapshotWithMeta: AccuracySnapshotEntry = { - ...snapshotEntry, - accuracyRunStatus: AccuracyRunStatus.InProgress, - createdOn: Date.now(), - }; - - await this.appendAccuracySnapshot(snapshotWithMeta); - } - - async getLatestSnapshotForCommit(commit: string): Promise { - const snapshot = await 
this.readSnapshot(); - const entries = snapshot - .filter((entry) => { - return entry.commitSHA === commit && entry.accuracyRunStatus === AccuracyRunStatus.Done; - }) - .sort((a, b) => b.createdOn - a.createdOn); - const latestRunId = entries[0]?.accuracyRunId; - return latestRunId ? snapshot.filter((entry) => entry.accuracyRunId === latestRunId) : []; - } - - async getSnapshotForAccuracyRun(accuracyRunId: string): Promise { - const snapshot = await this.readSnapshot(); - return snapshot.filter((entry) => entry.accuracyRunId === accuracyRunId); - } - - async updateAccuracyRunStatus(accuracyRunId: string, status: AccuracyRunStatuses) { - const snapshot = await this.readSnapshot(); - const updatedSnapshot = snapshot.map((entry) => { - if (entry.accuracyRunId === accuracyRunId) { - return { - ...entry, - accuracyRunStatus: status, - }; - } - - return entry; - }); - await this.writeSnapshot(updatedSnapshot); - } - - close(): Promise { - return Promise.resolve(); - } - - private async appendAccuracySnapshot(entry: AccuracySnapshotEntry): Promise { - for (let attempt = 0; attempt < 5; attempt++) { - try { - const snapshot = await this.readSnapshot(); - snapshot.unshift(entry); - await this.writeSnapshot(snapshot); - return; - } catch (e) { - if (attempt < 4) { - await this.waitFor(100 + Math.random() * 200); - } else { - throw e; - } - } - } - } - - private async writeSnapshot(snapshot: AccuracySnapshotEntry[]): Promise { - const tmp = `${LOCAL_SNAPSHOTS_FILE}~${Date.now()}`; - await fs.writeFile(tmp, JSON.stringify(snapshot, null, 2)); - await fs.rename(tmp, LOCAL_SNAPSHOTS_FILE); - } - - private async readSnapshot(): Promise { - try { - const raw = await fs.readFile(LOCAL_SNAPSHOTS_FILE, "utf8"); - return AccuracySnapshotEntrySchema.array().parse(JSON.parse(raw)); - } catch (e: unknown) { - if ((e as { code: string }).code === "ENOENT") { - return []; - } - throw e; - } - } - - private waitFor(ms: number) { - return new Promise((resolve) => setTimeout(resolve, ms)); - 
} - - static async getStorage() { - await fs.mkdir(GENERATED_ASSETS_DIR, { recursive: true }); - return new DiskSnapshotStorage(); - } -} diff --git a/tests/accuracy/sdk/accuracy-snapshot-storage/get-snapshot-storage.ts b/tests/accuracy/sdk/accuracy-snapshot-storage/get-snapshot-storage.ts deleted file mode 100644 index da67aa60..00000000 --- a/tests/accuracy/sdk/accuracy-snapshot-storage/get-snapshot-storage.ts +++ /dev/null @@ -1,7 +0,0 @@ -import { DiskSnapshotStorage } from "./disk-snapshot-storage.js"; -import { MongoDBSnapshotStorage } from "./mdb-snapshot-storage.js"; -import { AccuracySnapshotStorage } from "./snapshot-storage.js"; - -export async function getAccuracySnapshotStorage(): Promise { - return MongoDBSnapshotStorage.getStorage() ?? (await DiskSnapshotStorage.getStorage()); -} diff --git a/tests/accuracy/sdk/accuracy-snapshot-storage/mdb-snapshot-storage.ts b/tests/accuracy/sdk/accuracy-snapshot-storage/mdb-snapshot-storage.ts deleted file mode 100644 index a3915fdc..00000000 --- a/tests/accuracy/sdk/accuracy-snapshot-storage/mdb-snapshot-storage.ts +++ /dev/null @@ -1,82 +0,0 @@ -import { Collection, MongoClient } from "mongodb"; -import { - AccuracyRunStatus, - AccuracyRunStatuses, - AccuracySnapshotEntry, - AccuracySnapshotEntrySchema, - AccuracySnapshotStorage, -} from "./snapshot-storage.js"; - -export class MongoDBSnapshotStorage implements AccuracySnapshotStorage { - private readonly client: MongoClient; - private readonly snapshotCollection: Collection; - private constructor(mongodbUrl: string) { - this.client = new MongoClient(mongodbUrl); - this.snapshotCollection = this.client.db("mongodb-mcp-server").collection("accuracy-tests"); - } - - async createSnapshotEntry( - snapshotEntry: Pick< - AccuracySnapshotEntry, - | "accuracyRunId" - | "commitSHA" - | "provider" - | "requestedModel" - | "prompt" - | "toolCallingAccuracy" - | "expectedToolCalls" - | "actualToolCalls" - | "llmResponseTime" - | "tokensUsage" - | "respondingModel" - | 
"text" - | "messages" - > - ): Promise { - const snapshotWithMeta: AccuracySnapshotEntry = { - ...snapshotEntry, - accuracyRunStatus: AccuracyRunStatus.InProgress, - createdOn: Date.now(), - }; - await this.snapshotCollection.insertOne(snapshotWithMeta); - } - - async getLatestSnapshotForCommit(commit: string): Promise { - const latestRunId = await this.getLatestAccuracyRunForCommit(commit); - return latestRunId ? this.getSnapshotForAccuracyRun(latestRunId) : []; - } - - async getSnapshotForAccuracyRun(accuracyRunId: string): Promise { - const snapshotEntries = await this.snapshotCollection.find({ accuracyRunId }).toArray(); - return AccuracySnapshotEntrySchema.array().parse(snapshotEntries); - } - - private async getLatestAccuracyRunForCommit(commit: string): Promise { - const document = await this.snapshotCollection.findOne( - { commitSHA: commit, accuracyRunStatus: AccuracyRunStatus.Done }, - { sort: { createdOn: -1 }, projection: { accuracyRunId: 1 } } - ); - - return document?.accuracyRunId ? 
`${document?.accuracyRunId}` : undefined; - } - - async updateAccuracyRunStatus(accuracyRunId: string, status: AccuracyRunStatuses) { - await this.snapshotCollection.updateMany( - { accuracyRunId: accuracyRunId }, - { $set: { accuracyRunStatus: status } } - ); - } - - async close(): Promise { - await this.client.close(); - } - - static getStorage(): MongoDBSnapshotStorage | null { - const mongodbUrl = process.env.MDB_ACCURACY_MDB_URL; - if (!mongodbUrl) { - return null; - } - - return new MongoDBSnapshotStorage(mongodbUrl); - } -} diff --git a/tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.ts b/tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.ts deleted file mode 100644 index e0a6966d..00000000 --- a/tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.ts +++ /dev/null @@ -1,127 +0,0 @@ -import z from "zod"; - -const LLMToolCallSchema = z.object({ - toolCallId: z.string(), - toolName: z.string(), - parameters: z.record(z.string(), z.unknown()), -}); -export type LLMToolCall = z.infer; - -const ExpectedToolCallSchema = LLMToolCallSchema.omit({ toolCallId: true }); -export type ExpectedToolCall = z.infer; - -export const AccuracyRunStatus = { - Done: "done", - Failed: "failed", - InProgress: "in-progress", -} as const; - -export type AccuracyRunStatuses = (typeof AccuracyRunStatus)[keyof typeof AccuracyRunStatus]; - -export const AccuracySnapshotEntrySchema = z.object({ - /** - * A unique id for each accuracy run. Should either be generated by the - * script triggering the accuracy run or provided via environment variables. - * */ - accuracyRunId: z.string(), - - /** - * Represents the status of accuracy run. Each test completion, during an - * accuracy run, is supposed to submit an accuracy snapshot entry with - * InProgress status which then later, after completion of accuracy run, is - * updated to either Done or Failed, depending on whether there were errors - * during the run or not. 
*/ - accuracyRunStatus: z - .enum([AccuracyRunStatus.Done, AccuracyRunStatus.Failed, AccuracyRunStatus.InProgress]) - .default(AccuracyRunStatus.InProgress), - - /** Timestamp of when this snapshot entry was generated. */ - createdOn: z.number(), - - /** The commit SHA for which the accuracy run was triggered. */ - commitSHA: z.string(), - - /** The LLM provider providing the LLM APIs */ - provider: z.string(), - - /** The LLM which was requested to respond to our test prompts */ - requestedModel: z.string(), - - /** The actual prompt that was provided to LLM as test */ - prompt: z.string(), - - /** A number between 0 and 1, representing how accurately the expected tools - * were called by LLM when responding to the provided prompts. To know more - * about how this number is generated, check - toolCallingAccuracy.ts */ - toolCallingAccuracy: z.number(), - - /** - * A list of tools, along with their parameters, that are expected to be - * called by the LLM in test. */ - expectedToolCalls: ExpectedToolCallSchema.array(), - - /** - * A list of tools, along with their parameters, that were actually called - * by the LLM in test. */ - actualToolCalls: LLMToolCallSchema.array(), - - /** - * The total time taken by LLM to respond to our prompt. */ - llmResponseTime: z.number(), - - /** - * Token usage data, returned as part of LLM prompt response. */ - tokensUsage: z - .object({ - promptTokens: z.number().optional(), - completionTokens: z.number().optional(), - totalTokens: z.number().optional(), - }) - .optional(), - - /** - * The ID of the model that actually responded to our prompt request. */ - respondingModel: z.string(), - - /** - * The final response text generated by the LLM, in response to our prompt - * request. */ - text: z.string(), - - /** - * A list of messages, exchanged between LLM and our testing agent, in - * response to our prompt request. This is particularly helpful for - * debugging. 
*/ - messages: z.array(z.record(z.string(), z.unknown())), -}); - -export type AccuracySnapshotEntry = z.infer; - -export interface AccuracySnapshotStorage { - createSnapshotEntry( - snapshotEntry: Pick< - AccuracySnapshotEntry, - | "accuracyRunId" - | "commitSHA" - | "provider" - | "requestedModel" - | "prompt" - | "toolCallingAccuracy" - | "expectedToolCalls" - | "actualToolCalls" - | "llmResponseTime" - | "tokensUsage" - | "respondingModel" - | "text" - | "messages" - > - ): Promise; - - getLatestSnapshotForCommit(commit: string): Promise; - - getSnapshotForAccuracyRun(accuracyRunId: string): Promise; - - updateAccuracyRunStatus(accuracyRunId: string, status: AccuracyRunStatuses): Promise; - - close(): Promise; -} diff --git a/tests/accuracy/sdk/accuracy-testing-client.ts b/tests/accuracy/sdk/accuracy-testing-client.ts index d2486942..25a224a1 100644 --- a/tests/accuracy/sdk/accuracy-testing-client.ts +++ b/tests/accuracy/sdk/accuracy-testing-client.ts @@ -4,7 +4,7 @@ import { CallToolResult } from "@modelcontextprotocol/sdk/types.js"; import { StdioClientTransport } from "@modelcontextprotocol/sdk/client/stdio.js"; import { MCP_SERVER_CLI_SCRIPT } from "./constants.js"; -import { LLMToolCall } from "./accuracy-snapshot-storage/snapshot-storage.js"; +import { LLMToolCall } from "./accuracy-result-storage/result-storage.js"; type ToolResultGeneratorFn = (...parameters: unknown[]) => CallToolResult | Promise; export type MockedTools = Record; diff --git a/tests/accuracy/sdk/constants.ts b/tests/accuracy/sdk/constants.ts index 0598b1a7..0c70a9e9 100644 --- a/tests/accuracy/sdk/constants.ts +++ b/tests/accuracy/sdk/constants.ts @@ -15,7 +15,9 @@ export const TEST_DATA_DUMPS_DIR = path.join(__dirname, "test-data-dumps"); export const GENERATED_ASSETS_DIR = path.join(ROOT_DIR, ".accuracy"); -export const LOCAL_SNAPSHOTS_FILE = path.join(GENERATED_ASSETS_DIR, "snapshots.json"); +export const ACCURACY_RESULTS_DIR = path.join(GENERATED_ASSETS_DIR, "results"); + +export 
const LATEST_ACCURACY_RUN_NAME = "latest-run"; export const HTML_TESTS_SUMMARY_FILE = path.join(GENERATED_ASSETS_DIR, "tests-summary.html"); diff --git a/tests/accuracy/sdk/describe-accuracy-tests.ts b/tests/accuracy/sdk/describe-accuracy-tests.ts index 2a358ce1..b964cd48 100644 --- a/tests/accuracy/sdk/describe-accuracy-tests.ts +++ b/tests/accuracy/sdk/describe-accuracy-tests.ts @@ -3,8 +3,8 @@ import { calculateToolCallingAccuracy } from "./accuracy-scorer.js"; import { getVercelToolCallingAgent, VercelAgent } from "./agent.js"; import { prepareTestData, setupMongoDBIntegrationTest } from "../../integration/tools/mongodb/mongodbHelpers.js"; import { AccuracyTestingClient, MockedTools } from "./accuracy-testing-client.js"; -import { getAccuracySnapshotStorage } from "./accuracy-snapshot-storage/get-snapshot-storage.js"; -import { AccuracySnapshotStorage, ExpectedToolCall } from "./accuracy-snapshot-storage/snapshot-storage.js"; +import { AccuracyResultStorage, ExpectedToolCall } from "./accuracy-result-storage/result-storage.js"; +import { getAccuracyResultStorage } from "./accuracy-result-storage/get-accuracy-result-storage.js"; import { getCommitSHA } from "./git-info.js"; export interface AccuracyTestConfig { @@ -57,7 +57,7 @@ export function describeAccuracyTests(models: TestableModels, accuracyTestConfig const { populateTestData, cleanupTestDatabases } = prepareTestData(mdbIntegration); let commitSHA: string; - let accuracySnapshotStorage: AccuracySnapshotStorage; + let accuracyResultStorage: AccuracyResultStorage; let testMCPClient: AccuracyTestingClient; let agent: VercelAgent; @@ -68,7 +68,7 @@ export function describeAccuracyTests(models: TestableModels, accuracyTestConfig } commitSHA = retrievedCommitSHA; - accuracySnapshotStorage = await getAccuracySnapshotStorage(); + accuracyResultStorage = getAccuracyResultStorage(); testMCPClient = await AccuracyTestingClient.initializeClient(mdbIntegration.connectionString()); agent = getVercelToolCallingAgent(); 
}); @@ -80,7 +80,7 @@ export function describeAccuracyTests(models: TestableModels, accuracyTestConfig }); afterAll(async () => { - await accuracySnapshotStorage?.close(); + await accuracyResultStorage?.close(); await testMCPClient?.close(); }); @@ -102,17 +102,17 @@ export function describeAccuracyTests(models: TestableModels, accuracyTestConfig const toolCallingAccuracy = calculateToolCallingAccuracy(testConfig.expectedToolCalls, llmToolCalls); const responseTime = timeAfterPrompt - timeBeforePrompt; - await accuracySnapshotStorage.createSnapshotEntry({ - accuracyRunId, - commitSHA, + await accuracyResultStorage.saveModelResponseForPrompt(commitSHA, accuracyRunId, testConfig.prompt, { provider: model.provider, requestedModel: model.modelName, - prompt: testConfig.prompt, + respondingModel: result.respondingModel, llmResponseTime: responseTime, toolCallingAccuracy: toolCallingAccuracy, - actualToolCalls: llmToolCalls, expectedToolCalls: testConfig.expectedToolCalls, - ...result, + llmToolCalls: llmToolCalls, + tokensUsed: result.tokensUsage, + text: result.text, + messages: result.messages, }); }); }); diff --git a/tests/unit/accuracy-scorer.test.ts b/tests/unit/accuracy-scorer.test.ts index 60a389d7..cb844686 100644 --- a/tests/unit/accuracy-scorer.test.ts +++ b/tests/unit/accuracy-scorer.test.ts @@ -1,5 +1,5 @@ import { calculateToolCallingAccuracy } from "../accuracy/sdk/accuracy-scorer.js"; -import { ExpectedToolCall, LLMToolCall } from "../accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.js"; +import { ExpectedToolCall, LLMToolCall } from "../accuracy/sdk/accuracy-result-storage/result-storage.js"; describe("calculateToolCallingAccuracy", () => { describe("edge cases", () => { From fe47c61d093e3470700cf1f3e4e70b41e6796a2b Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Mon, 14 Jul 2025 01:36:47 +0200 Subject: [PATCH 50/91] chore: move accuracy scripts inside accuracy --- scripts/{ => accuracy}/generate-test-summary.ts | 8 ++++---- 
scripts/accuracy/run-accuracy-tests.sh | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) rename scripts/{ => accuracy}/generate-test-summary.ts (97%) diff --git a/scripts/generate-test-summary.ts b/scripts/accuracy/generate-test-summary.ts similarity index 97% rename from scripts/generate-test-summary.ts rename to scripts/accuracy/generate-test-summary.ts index 78eadd91..6b14e264 100644 --- a/scripts/generate-test-summary.ts +++ b/scripts/accuracy/generate-test-summary.ts @@ -1,14 +1,14 @@ import { readFile, writeFile } from "fs/promises"; -import { getAccuracyResultStorage } from "../tests/accuracy/sdk/accuracy-result-storage/get-accuracy-result-storage.js"; +import { getAccuracyResultStorage } from "../../tests/accuracy/sdk/accuracy-result-storage/get-accuracy-result-storage.js"; import { AccuracyResult, AccuracyRunStatuses, ExpectedToolCall, LLMToolCall, ModelResponse, -} from "../tests/accuracy/sdk/accuracy-result-storage/result-storage.js"; -import { getCommitSHA } from "../tests/accuracy/sdk/git-info.js"; -import { HTML_TESTS_SUMMARY_FILE, HTML_TESTS_SUMMARY_TEMPLATE } from "../tests/accuracy/sdk/constants.js"; +} from "../../tests/accuracy/sdk/accuracy-result-storage/result-storage.js"; +import { getCommitSHA } from "../../tests/accuracy/sdk/git-info.js"; +import { HTML_TESTS_SUMMARY_FILE, HTML_TESTS_SUMMARY_TEMPLATE } from "../../tests/accuracy/sdk/constants.js"; type ComparableAccuracyResult = Omit & { promptAndModelResponses: PromptAndModelResponse[]; diff --git a/scripts/accuracy/run-accuracy-tests.sh b/scripts/accuracy/run-accuracy-tests.sh index 10ae6192..2332457f 100644 --- a/scripts/accuracy/run-accuracy-tests.sh +++ b/scripts/accuracy/run-accuracy-tests.sh @@ -40,6 +40,6 @@ export MDB_ACCURACY_RUN_STATUS=$([ $JEST_EXIT_CODE -eq 0 ] && echo "done" || ech npx tsx scripts/accuracy/update-accuracy-run-status.ts || echo "Warning: Failed to update accuracy run status to '$MDB_ACCURACY_RUN_STATUS'" # This is optional but we do it anyways to generate 
a readable summary of report. -npx tsx scripts/generate-test-summary.ts || echo "Warning: Failed to generate test summary HTML report" +npx tsx scripts/accuracy/generate-test-summary.ts || echo "Warning: Failed to generate test summary HTML report" exit $JEST_EXIT_CODE \ No newline at end of file From 727be10c9e044a11726b4514df2fff1b8655569b Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Mon, 14 Jul 2025 01:55:20 +0200 Subject: [PATCH 51/91] chore: addresses more PR feedback 1. use commit sha for github actions 2. run workflow also on pushes to main 3. use ai-sdk/google instead of privately published package --- .github/workflows/accuracy-tests.yml | 8 ++- package-lock.json | 86 ++++++---------------------- package.json | 9 ++- 3 files changed, 29 insertions(+), 74 deletions(-) diff --git a/.github/workflows/accuracy-tests.yml b/.github/workflows/accuracy-tests.yml index bd20a4c8..57c102a0 100644 --- a/.github/workflows/accuracy-tests.yml +++ b/.github/workflows/accuracy-tests.yml @@ -2,8 +2,12 @@ name: Accuracy Tests on: workflow_dispatch: + push: + branches: + - main pull_request: - types: [labeled] + types: + - labeled jobs: run-accuracy-tests: @@ -41,6 +45,6 @@ jobs: path: .accuracy/tests-summary.html - name: Comment summary on PR if: github.event_name == 'pull_request' && github.event.label.name == 'accuracy-tests' - uses: marocchino/sticky-pull-request-comment@v2 + uses: marocchino/sticky-pull-request-comment@d2ad0de260ae8b0235ce059e63f2949ba9e05943 # v2 with: path: .accuracy/tests-summary.html diff --git a/package-lock.json b/package-lock.json index 2627186b..95cf8163 100644 --- a/package-lock.json +++ b/package-lock.json @@ -31,9 +31,9 @@ "mongodb-mcp-server": "dist/index.js" }, "devDependencies": { - "@ai-sdk/anthropic": "^1.2.12", - "@ai-sdk/azure": "^1.3.23", - "@ai-sdk/openai": "^1.3.22", + "@ai-sdk/azure": "^1.3.24", + "@ai-sdk/google": "^1.2.22", + "@ai-sdk/openai": "^1.3.23", "@eslint/js": "^9.30.1", "@himanshusinghs/google": "^1.2.11", 
"@modelcontextprotocol/inspector": "^0.16.0", @@ -42,7 +42,7 @@ "@types/simple-oauth2": "^5.0.7", "@types/yargs-parser": "^21.0.3", "@vitest/coverage-v8": "^3.2.4", - "ai": "^4.3.16", + "ai": "^4.3.17", "eslint": "^9.30.1", "eslint-config-prettier": "^10.1.5", "eslint-plugin-prettier": "^5.5.1", @@ -68,13 +68,14 @@ "@himanshusinghs/ai-sdk-google": { "extraneous": true }, - "node_modules/@ai-sdk/anthropic": { - "version": "1.2.12", - "resolved": "https://registry.npmjs.org/@ai-sdk/anthropic/-/anthropic-1.2.12.tgz", - "integrity": "sha512-YSzjlko7JvuiyQFmI9RN1tNZdEiZxc+6xld/0tq/VkJaHpEzGAb1yiNxxvmYVcjvfu/PcvCxAAYXmTYQQ63IHQ==", + "node_modules/@ai-sdk/azure": { + "version": "1.3.24", + "resolved": "https://registry.npmjs.org/@ai-sdk/azure/-/azure-1.3.24.tgz", + "integrity": "sha512-6zOG8mwmd8esSL/L9oYFZSyZWORRTxuG6on9A3RdPe7MRJ607Q6BWsuvul79kecbLf5xQ4bfP7LzXaBizsd8OA==", "dev": true, "license": "Apache-2.0", "dependencies": { + "@ai-sdk/openai": "1.3.23", "@ai-sdk/provider": "1.1.3", "@ai-sdk/provider-utils": "2.2.8" }, @@ -85,14 +86,13 @@ "zod": "^3.0.0" } }, - "node_modules/@ai-sdk/azure": { - "version": "1.3.23", - "resolved": "https://registry.npmjs.org/@ai-sdk/azure/-/azure-1.3.23.tgz", - "integrity": "sha512-vpsaPtU24RBVk/IMM5UylR/N4RtAuL2NZLWc7LJ3tvMTHu6pI46a7w+1qIwR3F6yO9ehWR8qvfLaBefJNFxaVw==", + "node_modules/@ai-sdk/google": { + "version": "1.2.22", + "resolved": "https://registry.npmjs.org/@ai-sdk/google/-/google-1.2.22.tgz", + "integrity": "sha512-Ppxu3DIieF1G9pyQ5O1Z646GYR0gkC57YdBqXJ82qvCdhEhZHu0TWhmnOoeIWe2olSbuDeoOY+MfJrW8dzS3Hw==", "dev": true, "license": "Apache-2.0", "dependencies": { - "@ai-sdk/openai": "1.3.22", "@ai-sdk/provider": "1.1.3", "@ai-sdk/provider-utils": "2.2.8" }, @@ -104,9 +104,9 @@ } }, "node_modules/@ai-sdk/openai": { - "version": "1.3.22", - "resolved": "https://registry.npmjs.org/@ai-sdk/openai/-/openai-1.3.22.tgz", - "integrity": "sha512-QwA+2EkG0QyjVR+7h6FE7iOu2ivNqAVMm9UJZkVxxTk5OIq5fFJDTEI/zICEMuHImTTXR2JjsL6EirJ28Jc4cw==", 
+ "version": "1.3.23", + "resolved": "https://registry.npmjs.org/@ai-sdk/openai/-/openai-1.3.23.tgz", + "integrity": "sha512-86U7rFp8yacUAOE/Jz8WbGcwMCqWvjK33wk5DXkfnAOEn3mx2r7tNSJdjukQFZbAK97VMXGPPHxF+aEARDXRXQ==", "dev": true, "license": "Apache-2.0", "dependencies": { @@ -1718,54 +1718,6 @@ "@hapi/hoek": "^11.0.2" } }, - "node_modules/@himanshusinghs/google": { - "version": "1.2.11", - "resolved": "https://registry.npmjs.org/@himanshusinghs/google/-/google-1.2.11.tgz", - "integrity": "sha512-SKTFxwN9PpUHVrppFod8sF1jqys5azzsgcBVrSbc7VaazmVEnBxHQlv5/yfeZFjD3ly5Mw+AJdFfC0bxwdWBNg==", - "dev": true, - "license": "Apache-2.0", - "dependencies": { - "@ai-sdk/provider": "1.1.2", - "@ai-sdk/provider-utils": "2.2.6" - }, - "engines": { - "node": ">=18" - }, - "peerDependencies": { - "zod": "^3.0.0" - } - }, - "node_modules/@himanshusinghs/google/node_modules/@ai-sdk/provider": { - "version": "1.1.2", - "resolved": "https://registry.npmjs.org/@ai-sdk/provider/-/provider-1.1.2.tgz", - "integrity": "sha512-ITdgNilJZwLKR7X5TnUr1BsQW6UTX5yFp0h66Nfx8XjBYkWD9W3yugr50GOz3CnE9m/U/Cd5OyEbTMI0rgi6ZQ==", - "dev": true, - "license": "Apache-2.0", - "dependencies": { - "json-schema": "^0.4.0" - }, - "engines": { - "node": ">=18" - } - }, - "node_modules/@himanshusinghs/google/node_modules/@ai-sdk/provider-utils": { - "version": "2.2.6", - "resolved": "https://registry.npmjs.org/@ai-sdk/provider-utils/-/provider-utils-2.2.6.tgz", - "integrity": "sha512-sUlZ7Gnq84DCGWMQRIK8XVbkzIBnvPR1diV4v6JwPgpn5armnLI/j+rqn62MpLrU5ZCQZlDKl/Lw6ed3ulYqaA==", - "dev": true, - "license": "Apache-2.0", - "dependencies": { - "@ai-sdk/provider": "1.1.2", - "nanoid": "^3.3.8", - "secure-json-parse": "^2.7.0" - }, - "engines": { - "node": ">=18" - }, - "peerDependencies": { - "zod": "^3.23.8" - } - }, "node_modules/@humanfs/core": { "version": "0.19.1", "resolved": "https://registry.npmjs.org/@humanfs/core/-/core-0.19.1.tgz", @@ -5963,9 +5915,9 @@ } }, "node_modules/ai": { - "version": "4.3.16", - "resolved": 
"https://registry.npmjs.org/ai/-/ai-4.3.16.tgz", - "integrity": "sha512-KUDwlThJ5tr2Vw0A1ZkbDKNME3wzWhuVfAOwIvFUzl1TPVDFAXDFTXio3p+jaKneB+dKNCvFFlolYmmgHttG1g==", + "version": "4.3.17", + "resolved": "https://registry.npmjs.org/ai/-/ai-4.3.17.tgz", + "integrity": "sha512-uWqIQ94Nb1GTYtYElGHegJMOzv3r2mCKNFlKrqkft9xrfvIahTI5OdcnD5U9612RFGuUNGmSDTO1/YRNFXobaQ==", "dev": true, "license": "Apache-2.0", "dependencies": { diff --git a/package.json b/package.json index fdd48f9c..7a2906e9 100644 --- a/package.json +++ b/package.json @@ -35,18 +35,17 @@ }, "license": "Apache-2.0", "devDependencies": { - "@ai-sdk/anthropic": "^1.2.12", - "@ai-sdk/azure": "^1.3.23", - "@ai-sdk/openai": "^1.3.22", + "@ai-sdk/azure": "^1.3.24", + "@ai-sdk/openai": "^1.3.23", "@eslint/js": "^9.30.1", - "@himanshusinghs/google": "^1.2.11", + "@ai-sdk/google": "^1.2.22", "@modelcontextprotocol/inspector": "^0.16.0", "@redocly/cli": "^1.34.4", "@types/node": "^24.0.12", "@types/simple-oauth2": "^5.0.7", "@types/yargs-parser": "^21.0.3", "@vitest/coverage-v8": "^3.2.4", - "ai": "^4.3.16", + "ai": "^4.3.17", "eslint": "^9.30.1", "eslint-config-prettier": "^10.1.5", "eslint-plugin-prettier": "^5.5.1", From a0b980216b3c18e7c1c78ace5d785f0483425956 Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Mon, 14 Jul 2025 01:59:12 +0200 Subject: [PATCH 52/91] chore: use @ai-sdk/google --- tests/accuracy/sdk/models.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/accuracy/sdk/models.ts b/tests/accuracy/sdk/models.ts index 9f47028f..928b7793 100644 --- a/tests/accuracy/sdk/models.ts +++ b/tests/accuracy/sdk/models.ts @@ -1,5 +1,5 @@ import { LanguageModelV1 } from "ai"; -import { createGoogleGenerativeAI } from "@himanshusinghs/google"; +import { createGoogleGenerativeAI } from "@ai-sdk/google"; import { createAzure } from "@ai-sdk/azure"; import { createOpenAI } from "@ai-sdk/openai"; import { ollama } from "ollama-ai-provider"; From f4ddec2f6a84a6245c4d71cc1f59b5f0d60042af Mon Sep 17 
00:00:00 2001 From: Himanshu Singh Date: Mon, 14 Jul 2025 02:05:07 +0200 Subject: [PATCH 53/91] chore: use npm script in ci --- .github/workflows/accuracy-tests.yml | 2 +- package.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/accuracy-tests.yml b/.github/workflows/accuracy-tests.yml index 57c102a0..354505bb 100644 --- a/.github/workflows/accuracy-tests.yml +++ b/.github/workflows/accuracy-tests.yml @@ -36,7 +36,7 @@ jobs: - name: Install dependencies run: npm ci - name: Run accuracy tests - run: ./scripts/run-accuracy-tests.sh + run: npm run test:accuracy - name: Upload accuracy test summary if: always() uses: actions/upload-artifact@v4 diff --git a/package.json b/package.json index 7a2906e9..c8687298 100644 --- a/package.json +++ b/package.json @@ -30,7 +30,7 @@ "reformat": "prettier --write .", "generate": "./scripts/generate.sh", "test": "vitest --coverage", - "pre:test:accuracy": "npm run build:compile", + "pretest:accuracy": "npm run build:compile", "test:accuracy": "sh ./scripts/accuracy/run-accuracy-tests.sh" }, "license": "Apache-2.0", From ea25ac5fef7301f807c9818e814dcb552e0f2c14 Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Mon, 14 Jul 2025 02:14:20 +0200 Subject: [PATCH 54/91] chore: shift only when arguments are passed to the script --- scripts/accuracy/run-accuracy-tests.sh | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/scripts/accuracy/run-accuracy-tests.sh b/scripts/accuracy/run-accuracy-tests.sh index 2332457f..99a1897f 100644 --- a/scripts/accuracy/run-accuracy-tests.sh +++ b/scripts/accuracy/run-accuracy-tests.sh @@ -14,8 +14,12 @@ export MDB_ACCURACY_RUN_ID=$(npx uuid v4) # By default we run all the tests under tests/accuracy folder unless a path is # specified in the command line. 
Such as: # npm run test:accuracy -- tests/accuracy/some-test.test.ts -TEST_PATH_PATTERN="${1:-tests/accuracy}" -shift || true +if [ $# -gt 0 ]; then + TEST_PATH_PATTERN="$1" + shift +else + TEST_PATH_PATTERN="tests/accuracy" +fi echo "Running accuracy tests with MDB_ACCURACY_RUN_ID '$MDB_ACCURACY_RUN_ID' and TEST_PATH_PATTERN '$TEST_PATH_PATTERN'" node --experimental-vm-modules node_modules/jest/bin/jest.js --bail --testPathPatterns "$TEST_PATH_PATTERN" "$@" From d50824d33dec0995c225e1cb335321d36fa7a619 Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Mon, 14 Jul 2025 09:28:36 +0200 Subject: [PATCH 55/91] chore: azure url is on vars --- .github/workflows/accuracy-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/accuracy-tests.yml b/.github/workflows/accuracy-tests.yml index 354505bb..43bd7be8 100644 --- a/.github/workflows/accuracy-tests.yml +++ b/.github/workflows/accuracy-tests.yml @@ -23,7 +23,7 @@ jobs: MDB_OPEN_AI_API_KEY: ${{ secrets.ACCURACY_OPEN_AI_API_KEY }} MDB_GEMINI_API_KEY: ${{ secrets.ACCURACY_GEMINI_API_KEY }} MDB_AZURE_OPEN_AI_API_KEY: ${{ secrets.ACCURACY_AZURE_OPEN_AI_API_KEY }} - MDB_AZURE_OPEN_AI_API_URL: ${{ secrets.ACCURACY_AZURE_OPEN_AI_API_URL }} + MDB_AZURE_OPEN_AI_API_URL: ${{ vars.ACCURACY_AZURE_OPEN_AI_API_URL }} MDB_ACCURACY_MDB_URL: ${{ secrets.ACCURACY_MDB_CONNECTION_STRING }} MDB_ACCURACY_BASELINE_COMMIT: ${{ github.event.pull_request.base.sha || '' }} steps: From 772a0a37d2a489486eb61332e3826a6c44098ff6 Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Mon, 14 Jul 2025 11:10:47 +0200 Subject: [PATCH 56/91] chore: use env vars for mongo namespace --- .github/workflows/accuracy-tests.yml | 2 ++ scripts/accuracy/run-accuracy-tests.sh | 2 ++ .../accuracy-result-storage/get-accuracy-result-storage.ts | 5 +++-- .../accuracy/sdk/accuracy-result-storage/mongodb-storage.ts | 4 +++- 4 files changed, 10 insertions(+), 3 deletions(-) diff --git a/.github/workflows/accuracy-tests.yml 
b/.github/workflows/accuracy-tests.yml index 43bd7be8..f7e22b61 100644 --- a/.github/workflows/accuracy-tests.yml +++ b/.github/workflows/accuracy-tests.yml @@ -25,6 +25,8 @@ jobs: MDB_AZURE_OPEN_AI_API_KEY: ${{ secrets.ACCURACY_AZURE_OPEN_AI_API_KEY }} MDB_AZURE_OPEN_AI_API_URL: ${{ vars.ACCURACY_AZURE_OPEN_AI_API_URL }} MDB_ACCURACY_MDB_URL: ${{ secrets.ACCURACY_MDB_CONNECTION_STRING }} + MDB_ACCURACY_MDB_DB: ${{ vars.ACCURACY_MDB_DB }} + MDB_ACCURACY_MDB_COLLECTION: ${{ vars.ACCURACY_MDB_COLLECTION }} MDB_ACCURACY_BASELINE_COMMIT: ${{ github.event.pull_request.base.sha || '' }} steps: - uses: GitHubSecurityLab/actions-permissions/monitor@v1 diff --git a/scripts/accuracy/run-accuracy-tests.sh b/scripts/accuracy/run-accuracy-tests.sh index 99a1897f..5924b4e4 100644 --- a/scripts/accuracy/run-accuracy-tests.sh +++ b/scripts/accuracy/run-accuracy-tests.sh @@ -10,6 +10,8 @@ export MDB_ACCURACY_RUN_ID=$(npx uuid v4) # For providing a mongodb based storage to store accuracy result # export MDB_ACCURACY_MDB_URL="" +# export MDB_ACCURACY_MDB_DB="" +# export MDB_ACCURACY_MDB_COLLECTION="" # By default we run all the tests under tests/accuracy folder unless a path is # specified in the command line. 
Such as: diff --git a/tests/accuracy/sdk/accuracy-result-storage/get-accuracy-result-storage.ts b/tests/accuracy/sdk/accuracy-result-storage/get-accuracy-result-storage.ts index 390ca231..82475bff 100644 --- a/tests/accuracy/sdk/accuracy-result-storage/get-accuracy-result-storage.ts +++ b/tests/accuracy/sdk/accuracy-result-storage/get-accuracy-result-storage.ts @@ -3,8 +3,9 @@ import { MongoDBBasedResultStorage } from "./mongodb-storage.js"; import { AccuracyResultStorage } from "./result-storage.js"; export function getAccuracyResultStorage(): AccuracyResultStorage { - if (process.env.MDB_ACCURACY_MDB_URL) { - return new MongoDBBasedResultStorage(process.env.MDB_ACCURACY_MDB_URL); + const { MDB_ACCURACY_MDB_URL, MDB_ACCURACY_MDB_DB, MDB_ACCURACY_MDB_COLLECTION } = process.env; + if (MDB_ACCURACY_MDB_URL && MDB_ACCURACY_MDB_DB && MDB_ACCURACY_MDB_COLLECTION) { + return new MongoDBBasedResultStorage(MDB_ACCURACY_MDB_URL, MDB_ACCURACY_MDB_DB, MDB_ACCURACY_MDB_COLLECTION); } return new DiskBasedResultStorage(); } diff --git a/tests/accuracy/sdk/accuracy-result-storage/mongodb-storage.ts b/tests/accuracy/sdk/accuracy-result-storage/mongodb-storage.ts index 000dce48..694a78be 100644 --- a/tests/accuracy/sdk/accuracy-result-storage/mongodb-storage.ts +++ b/tests/accuracy/sdk/accuracy-result-storage/mongodb-storage.ts @@ -13,11 +13,13 @@ export class MongoDBBasedResultStorage implements AccuracyResultStorage { constructor( connectionString: string, + database: string, + collection: string, // Omitting these as they might contain large chunk of texts private readonly omittedModelResponseFields: (keyof ModelResponse)[] = ["messages", "text"] ) { this.client = new MongoClient(connectionString); - this.resultCollection = this.client.db("mongodb-mcp-server").collection("accuracy-results"); + this.resultCollection = this.client.db(database).collection(collection); } async getAccuracyResult(commitSHA: string, runId?: string): Promise { From 
1c2295a0fab9b5aa2ea6f765d912d640e8590aaa Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Mon, 14 Jul 2025 11:19:15 +0200 Subject: [PATCH 57/91] chore: ensure the generated asset directory is present --- .github/workflows/accuracy-tests.yml | 4 ++-- scripts/accuracy/generate-test-summary.ts | 5 ++++- scripts/accuracy/run-accuracy-tests.sh | 2 +- tests/accuracy/sdk/constants.ts | 2 +- 4 files changed, 8 insertions(+), 5 deletions(-) diff --git a/.github/workflows/accuracy-tests.yml b/.github/workflows/accuracy-tests.yml index f7e22b61..9edca3a5 100644 --- a/.github/workflows/accuracy-tests.yml +++ b/.github/workflows/accuracy-tests.yml @@ -44,9 +44,9 @@ jobs: uses: actions/upload-artifact@v4 with: name: accuracy-test-summary - path: .accuracy/tests-summary.html + path: .accuracy/test-summary.html - name: Comment summary on PR if: github.event_name == 'pull_request' && github.event.label.name == 'accuracy-tests' uses: marocchino/sticky-pull-request-comment@d2ad0de260ae8b0235ce059e63f2949ba9e05943 # v2 with: - path: .accuracy/tests-summary.html + path: .accuracy/test-summary.html diff --git a/scripts/accuracy/generate-test-summary.ts b/scripts/accuracy/generate-test-summary.ts index 6b14e264..b059a473 100644 --- a/scripts/accuracy/generate-test-summary.ts +++ b/scripts/accuracy/generate-test-summary.ts @@ -1,4 +1,5 @@ -import { readFile, writeFile } from "fs/promises"; +import path from "path"; +import { readFile, writeFile, mkdir } from "fs/promises"; import { getAccuracyResultStorage } from "../../tests/accuracy/sdk/accuracy-result-storage/get-accuracy-result-storage.js"; import { AccuracyResult, @@ -247,6 +248,8 @@ async function generateTestSummary() { const testSummary = getTestSummary(comparableAccuracyResult); const htmlReport = await generateHtmlReport(comparableAccuracyResult, testSummary, baselineInfo); + // Ensure that our writable path actually exist. 
+ await mkdir(path.dirname(HTML_TESTS_SUMMARY_FILE), { recursive: true }); await writeFile(HTML_TESTS_SUMMARY_FILE, htmlReport, "utf8"); console.log(`✅ HTML report generated: ${HTML_TESTS_SUMMARY_FILE}`); diff --git a/scripts/accuracy/run-accuracy-tests.sh b/scripts/accuracy/run-accuracy-tests.sh index 5924b4e4..a8b08532 100644 --- a/scripts/accuracy/run-accuracy-tests.sh +++ b/scripts/accuracy/run-accuracy-tests.sh @@ -33,7 +33,7 @@ JEST_EXIT_CODE=$? # "in-progress". When all the tests are done and jest exits with an exit code of # 0, we can safely mark accuracy run as finished otherwise failed. -# This "outside-the-tests-status-update" is arising out of the fact that each +# This "outside-the-test-status-update" is arising out of the fact that each # test suite stores their own accuracy run data in the storage and this setup # might lead to data inconsistency when the tests fail. To overcome that each # accuracy result entry has a status which by default is "in-progress" and is diff --git a/tests/accuracy/sdk/constants.ts b/tests/accuracy/sdk/constants.ts index 0c70a9e9..188fff53 100644 --- a/tests/accuracy/sdk/constants.ts +++ b/tests/accuracy/sdk/constants.ts @@ -19,6 +19,6 @@ export const ACCURACY_RESULTS_DIR = path.join(GENERATED_ASSETS_DIR, "results"); export const LATEST_ACCURACY_RUN_NAME = "latest-run"; -export const HTML_TESTS_SUMMARY_FILE = path.join(GENERATED_ASSETS_DIR, "tests-summary.html"); +export const HTML_TESTS_SUMMARY_FILE = path.join(GENERATED_ASSETS_DIR, "test-summary.html"); export const HTML_TESTS_SUMMARY_TEMPLATE = path.join(RESOURCES_DIR, "test-summary-template.html"); From a3ba9e0374360bb815b790d60233d3c03e1b3de6 Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Mon, 14 Jul 2025 12:36:23 +0200 Subject: [PATCH 58/91] chore: generate a markdown brief for PR comments --- .github/workflows/accuracy-tests.yml | 2 +- resources/test-summary-template.html | 16 ++-- scripts/accuracy/generate-test-summary.ts | 98 ++++++++++++++++++----- 
tests/accuracy/sdk/constants.ts | 4 +- 4 files changed, 91 insertions(+), 29 deletions(-) diff --git a/.github/workflows/accuracy-tests.yml b/.github/workflows/accuracy-tests.yml index 9edca3a5..57a4fa28 100644 --- a/.github/workflows/accuracy-tests.yml +++ b/.github/workflows/accuracy-tests.yml @@ -49,4 +49,4 @@ jobs: if: github.event_name == 'pull_request' && github.event.label.name == 'accuracy-tests' uses: marocchino/sticky-pull-request-comment@d2ad0de260ae8b0235ce059e63f2949ba9e05943 # v2 with: - path: .accuracy/test-summary.html + path: .accuracy/test-brief.md diff --git a/resources/test-summary-template.html b/resources/test-summary-template.html index a5c14f33..2fa0498f 100644 --- a/resources/test-summary-template.html +++ b/resources/test-summary-template.html @@ -331,15 +331,15 @@

📈 Test Results Summary

Total Prompts Evaluated
-
{{totalTests}}
+
{{totalPrompts}}
Models Tested
-
{{modelsCount}}
+
{{totalModels}}
-
Evals with 0% Accuracy
-
{{testsWithZeroAccuracy}}
+
Responses with 0% Accuracy
+
{{responsesWithZeroAccuracy}}
Average Accuracy
@@ -368,12 +368,12 @@

🔄 Baseline Comparison

{{baselineCreatedOn}}
-
Evals Improved vs Baseline
-
{{evalsImproved}}
+
Responses Improved vs Baseline
+
{{responsesImproved}}
-
Evals Regressed vs Baseline
-
{{evalsRegressed}}
+
Responses Regressed vs Baseline
+
{{responsesRegressed}}
diff --git a/scripts/accuracy/generate-test-summary.ts b/scripts/accuracy/generate-test-summary.ts index b059a473..027944e0 100644 --- a/scripts/accuracy/generate-test-summary.ts +++ b/scripts/accuracy/generate-test-summary.ts @@ -9,7 +9,11 @@ import { ModelResponse, } from "../../tests/accuracy/sdk/accuracy-result-storage/result-storage.js"; import { getCommitSHA } from "../../tests/accuracy/sdk/git-info.js"; -import { HTML_TESTS_SUMMARY_FILE, HTML_TESTS_SUMMARY_TEMPLATE } from "../../tests/accuracy/sdk/constants.js"; +import { + HTML_TEST_SUMMARY_FILE, + HTML_TESTS_SUMMARY_TEMPLATE, + MARKDOWN_TEST_BRIEF_FILE, +} from "../../tests/accuracy/sdk/constants.js"; type ComparableAccuracyResult = Omit & { promptAndModelResponses: PromptAndModelResponse[]; @@ -109,15 +113,15 @@ function getTestSummary(comparableResult: ComparableAccuracyResult) { return { totalPrompts: new Set(responses.map((r) => r.prompt)).size, totalModels: new Set(responses.map((r) => `${r.provider} ${r.requestedModel}`)).size, - testsWithZeroAccuracy: responses.filter((r) => r.toolCallingAccuracy === 0), - testsWith75Accuracy: responses.filter((r) => r.toolCallingAccuracy === 0.75), - testsWith100Accuracy: responses.filter((r) => r.toolCallingAccuracy === 100), + responsesWithZeroAccuracy: responses.filter((r) => r.toolCallingAccuracy === 0), + responsesWith75Accuracy: responses.filter((r) => r.toolCallingAccuracy === 0.75), + responsesWith100Accuracy: responses.filter((r) => r.toolCallingAccuracy === 1), averageAccuracy: responses.length > 0 ? 
responses.reduce((sum, r) => sum + r.toolCallingAccuracy, 0) / responses.length : 0, - evalsImproved: responses.filter( + responsesImproved: responses.filter( (r) => typeof r.baselineToolAccuracy === "number" && r.toolCallingAccuracy > r.baselineToolAccuracy ).length, - evalsRegressed: responses.filter( + responsesRegressed: responses.filter( (r) => typeof r.baselineToolAccuracy === "number" && r.toolCallingAccuracy < r.baselineToolAccuracy ).length, reportGeneratedOn: new Date().toLocaleString(), @@ -172,9 +176,9 @@ async function generateHtmlReport( accuracyRunStatus: formatRunStatus(comparableResult.runStatus), reportGeneratedOn: testSummary.reportGeneratedOn, createdOn: testSummary.resultCreatedOn, - totalTests: String(testSummary.totalPrompts), - modelsCount: String(testSummary.totalModels), - testsWithZeroAccuracy: String(testSummary.testsWithZeroAccuracy.length), + totalPrompts: String(testSummary.totalPrompts), + totalModels: String(testSummary.totalModels), + responsesWithZeroAccuracy: String(testSummary.responsesWithZeroAccuracy.length), averageAccuracy: formatAccuracy(testSummary.averageAccuracy), baselineCommitSHA: baselineInfo?.commitSHA || "-", baselineAccuracyRunId: baselineInfo?.accuracyRunId || "-", @@ -182,12 +186,64 @@ async function generateHtmlReport( ? formatRunStatus(baselineInfo?.accuracyRunStatus) : "-", baselineCreatedOn: baselineInfo?.createdOn || "-", - evalsImproved: baselineInfo ? String(testSummary.evalsImproved) : "-", - evalsRegressed: baselineInfo ? String(testSummary.evalsRegressed) : "-", + responsesImproved: baselineInfo ? String(testSummary.responsesImproved) : "-", + responsesRegressed: baselineInfo ? 
String(testSummary.responsesRegressed) : "-", tableRows, }); } +function generateMarkdownBrief( + comparableResult: ComparableAccuracyResult, + testSummary: ReturnType, + baselineInfo: BaselineRunInfo | null +): string { + const markdownTexts = [ + "# 📊 Accuracy Test Results", + "## 📈 Summary", + "| Metric | Value |", + "|--------|-------|", + `| **Commit SHA** | \`${comparableResult.commitSHA}\` |`, + `| **Run ID** | \`${comparableResult.runId}\` |`, + `| **Status** | ${comparableResult.runStatus} |`, + `| **Total Prompts Evaluated** | ${testSummary.totalPrompts} |`, + `| **Models Tested** | ${testSummary.totalModels} |`, + `| **Average Accuracy** | ${formatAccuracy(testSummary.averageAccuracy)} |`, + `| **Responses with 0% Accuracy** | ${testSummary.responsesWithZeroAccuracy.length} |`, + `| **Responses with 75% Accuracy** | ${testSummary.responsesWith75Accuracy.length} |`, + `| **Responses with 100% Accuracy** | ${testSummary.responsesWith100Accuracy.length} |`, + "", + ]; + + if (baselineInfo) { + markdownTexts.push( + ...[ + "## 📊 Baseline Comparison", + "|--------|-------|", + `| **Baseline Commit** | \`${baselineInfo.commitSHA}\` |`, + `| **Baseline Run ID** | \`${baselineInfo.accuracyRunId}\` |`, + `| **Baseline Run Status** | \`${baselineInfo.accuracyRunStatus}\` |`, + `| **Responses Improved** | ${testSummary.responsesImproved} |`, + `| **Responses Regressed** | ${testSummary.responsesRegressed} |`, + "", + ] + ); + } + + const { GITHUB_SERVER_URL, GITHUB_REPOSITORY, GITHUB_RUN_ID } = process.env; + const githubRunUrl = + GITHUB_SERVER_URL && GITHUB_REPOSITORY && GITHUB_RUN_ID + ? `${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}` + : null; + + const reportLinkText = githubRunUrl + ? 
`📎 **[Download Full HTML Report](${githubRunUrl})** - Look for the \`accuracy-test-summary\` artifact for detailed results.` + : `📎 **Full HTML Report**: \`${HTML_TEST_SUMMARY_FILE}\``; + + markdownTexts.push(...["---", reportLinkText, "", `*Report generated on: ${testSummary.reportGeneratedOn}*`]); + + return markdownTexts.join("\n"); +} + async function generateTestSummary() { const storage = getAccuracyResultStorage(); try { @@ -244,25 +300,29 @@ async function generateTestSummary() { ), }; + // Ensure that our writable path actually exist. + await mkdir(path.dirname(HTML_TEST_SUMMARY_FILE), { recursive: true }); + console.log(`\n📊 Generating test summary for accuracy run: ${accuracyRunId}\n`); const testSummary = getTestSummary(comparableAccuracyResult); - const htmlReport = await generateHtmlReport(comparableAccuracyResult, testSummary, baselineInfo); - // Ensure that our writable path actually exist. - await mkdir(path.dirname(HTML_TESTS_SUMMARY_FILE), { recursive: true }); - await writeFile(HTML_TESTS_SUMMARY_FILE, htmlReport, "utf8"); + const htmlReport = await generateHtmlReport(comparableAccuracyResult, testSummary, baselineInfo); + await writeFile(HTML_TEST_SUMMARY_FILE, htmlReport, "utf8"); + console.log(`✅ HTML report generated: ${HTML_TEST_SUMMARY_FILE}`); - console.log(`✅ HTML report generated: ${HTML_TESTS_SUMMARY_FILE}`); + const markdownBrief = generateMarkdownBrief(comparableAccuracyResult, testSummary, baselineInfo); + await writeFile(MARKDOWN_TEST_BRIEF_FILE, markdownBrief, "utf8"); + console.log(`✅ Markdown brief generated: ${MARKDOWN_TEST_BRIEF_FILE}`); console.log(`\n📈 Summary:`); console.log(` Total prompts evaluated: ${testSummary.totalPrompts}`); console.log(` Models tested: ${testSummary.totalModels}`); - console.log(` Evals with 0% accuracy: ${testSummary.testsWithZeroAccuracy.length}`); + console.log(` Responses with 0% accuracy: ${testSummary.responsesWithZeroAccuracy.length}`); if (baselineCommit) { console.log(` Baseline commit: 
${baselineCommit}`); - console.log(` Evals improved vs baseline: ${testSummary.evalsImproved}`); - console.log(` Evals regressed vs baseline: ${testSummary.evalsRegressed}`); + console.log(` Responses improved vs baseline: ${testSummary.responsesImproved}`); + console.log(` Responses regressed vs baseline: ${testSummary.responsesRegressed}`); } } catch (error) { console.error("Error generating test summary:", error); diff --git a/tests/accuracy/sdk/constants.ts b/tests/accuracy/sdk/constants.ts index 188fff53..c59534e3 100644 --- a/tests/accuracy/sdk/constants.ts +++ b/tests/accuracy/sdk/constants.ts @@ -19,6 +19,8 @@ export const ACCURACY_RESULTS_DIR = path.join(GENERATED_ASSETS_DIR, "results"); export const LATEST_ACCURACY_RUN_NAME = "latest-run"; -export const HTML_TESTS_SUMMARY_FILE = path.join(GENERATED_ASSETS_DIR, "test-summary.html"); +export const HTML_TEST_SUMMARY_FILE = path.join(GENERATED_ASSETS_DIR, "test-summary.html"); + +export const MARKDOWN_TEST_BRIEF_FILE = path.join(GENERATED_ASSETS_DIR, "test-brief.md"); export const HTML_TESTS_SUMMARY_TEMPLATE = path.join(RESOURCES_DIR, "test-summary-template.html"); From bf0e696c5e4f1b03166b3dea8d4797aabaa689f9 Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Mon, 14 Jul 2025 15:03:16 +0200 Subject: [PATCH 59/91] chore: use lockfile for updating local test results --- package-lock.json | 41 ++++++ package.json | 6 +- .../accuracy-result-storage/disk-storage.ts | 138 +++++++++++------- 3 files changed, 132 insertions(+), 53 deletions(-) diff --git a/package-lock.json b/package-lock.json index 95cf8163..5f32bb57 100644 --- a/package-lock.json +++ b/package-lock.json @@ -39,6 +39,7 @@ "@modelcontextprotocol/inspector": "^0.16.0", "@redocly/cli": "^1.34.4", "@types/node": "^24.0.12", + "@types/proper-lockfile": "^4.1.4", "@types/simple-oauth2": "^5.0.7", "@types/yargs-parser": "^21.0.3", "@vitest/coverage-v8": "^3.2.4", @@ -53,6 +54,7 @@ "openapi-types": "^12.1.3", "openapi-typescript": "^7.8.0", "prettier": 
"^3.6.2", + "proper-lockfile": "^4.1.2", "tsx": "^4.20.3", "typescript": "^5.8.3", "typescript-eslint": "^8.36.0", @@ -5402,6 +5404,23 @@ "undici-types": "~7.8.0" } }, + "node_modules/@types/proper-lockfile": { + "version": "4.1.4", + "resolved": "https://registry.npmjs.org/@types/proper-lockfile/-/proper-lockfile-4.1.4.tgz", + "integrity": "sha512-uo2ABllncSqg9F1D4nugVl9v93RmjxF6LJzQLMLDdPaXCUIDPeOJ21Gbqi43xNKzBi/WQ0Q0dICqufzQbMjipQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/retry": "*" + } + }, + "node_modules/@types/retry": { + "version": "0.12.5", + "resolved": "https://registry.npmjs.org/@types/retry/-/retry-0.12.5.tgz", + "integrity": "sha512-3xSjTp3v03X/lSQLkczaN9UIEwJMoMCA1+Nb5HfbJEQWogdeQIyVtTvxPXDQjZ5zws8rFQfVfRdz03ARihPJgw==", + "dev": true, + "license": "MIT" + }, "node_modules/@types/simple-oauth2": { "version": "5.0.7", "resolved": "https://registry.npmjs.org/@types/simple-oauth2/-/simple-oauth2-5.0.7.tgz", @@ -11132,6 +11151,18 @@ "dev": true, "license": "MIT" }, + "node_modules/proper-lockfile": { + "version": "4.1.2", + "resolved": "https://registry.npmjs.org/proper-lockfile/-/proper-lockfile-4.1.2.tgz", + "integrity": "sha512-TjNPblN4BwAWMXU8s9AEz4JmQxnD1NNL7bNOY/AKUzyamc379FWASUhc/K1pL2noVb+XmZKLL68cjzLsiOAMaA==", + "dev": true, + "license": "MIT", + "dependencies": { + "graceful-fs": "^4.2.4", + "retry": "^0.12.0", + "signal-exit": "^3.0.2" + } + }, "node_modules/protobufjs": { "version": "7.5.0", "resolved": "https://registry.npmjs.org/protobufjs/-/protobufjs-7.5.0.tgz", @@ -11553,6 +11584,16 @@ "url": "https://github.com/privatenumber/resolve-pkg-maps?sponsor=1" } }, + "node_modules/retry": { + "version": "0.12.0", + "resolved": "https://registry.npmjs.org/retry/-/retry-0.12.0.tgz", + "integrity": "sha512-9LkiTwjUh6rT555DtE9rTX+BKByPfrMzEAtnlEtdEwr3Nkffwiihqe2bWADg+OQRjt9gl6ICdmB/ZFDCGAtSow==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">= 4" + } + }, "node_modules/reusify": { "version": "1.1.0", 
"resolved": "https://registry.npmjs.org/reusify/-/reusify-1.1.0.tgz", diff --git a/package.json b/package.json index c8687298..8958a95f 100644 --- a/package.json +++ b/package.json @@ -36,12 +36,13 @@ "license": "Apache-2.0", "devDependencies": { "@ai-sdk/azure": "^1.3.24", + "@ai-sdk/google": "^1.2.22", "@ai-sdk/openai": "^1.3.23", "@eslint/js": "^9.30.1", - "@ai-sdk/google": "^1.2.22", "@modelcontextprotocol/inspector": "^0.16.0", "@redocly/cli": "^1.34.4", "@types/node": "^24.0.12", + "@types/proper-lockfile": "^4.1.4", "@types/simple-oauth2": "^5.0.7", "@types/yargs-parser": "^21.0.3", "@vitest/coverage-v8": "^3.2.4", @@ -56,11 +57,12 @@ "openapi-types": "^12.1.3", "openapi-typescript": "^7.8.0", "prettier": "^3.6.2", + "proper-lockfile": "^4.1.2", + "simple-git": "^3.28.0", "tsx": "^4.20.3", "typescript": "^5.8.3", "typescript-eslint": "^8.36.0", "vitest": "^3.2.4", - "simple-git": "^3.28.0", "uuid": "^11.1.0", "yaml": "^2.8.0" }, diff --git a/tests/accuracy/sdk/accuracy-result-storage/disk-storage.ts b/tests/accuracy/sdk/accuracy-result-storage/disk-storage.ts index 204a553b..32b97056 100644 --- a/tests/accuracy/sdk/accuracy-result-storage/disk-storage.ts +++ b/tests/accuracy/sdk/accuracy-result-storage/disk-storage.ts @@ -1,5 +1,6 @@ import path from "path"; import fs from "fs/promises"; +import { lock } from "proper-lockfile"; import { ACCURACY_RESULTS_DIR, LATEST_ACCURACY_RUN_NAME } from "../constants.js"; import { AccuracyResult, @@ -25,7 +26,7 @@ export class DiskBasedResultStorage implements AccuracyResultStorage { const raw = await fs.readFile(filePath, "utf8"); return JSON.parse(raw) as AccuracyResult; } catch (error) { - if ((error as { code: string }).code === "ENOENT") { + if ((error as NodeJS.ErrnoException).code === "ENOENT") { return null; } throw error; @@ -33,19 +34,35 @@ export class DiskBasedResultStorage implements AccuracyResultStorage { } async updateRunStatus(commitSHA: string, runId: string, status: AccuracyRunStatuses): Promise { - 
await this.atomicWriteResult(commitSHA, runId, async () => { + const resultFilePath = this.getAccuracyResultFilePath(commitSHA, runId); + const release = await lock(resultFilePath, { retries: 10 }); + try { const accuracyResult = await this.getAccuracyResult(commitSHA, runId); if (!accuracyResult) { - throw new Error( - `Cannot update run status to ${status} for commit - ${commitSHA}, runId - ${runId}. Results not found!` - ); + throw new Error("Results not found!"); } - return { - ...accuracyResult, - runStatus: status, - }; - }); + await fs.writeFile( + resultFilePath, + JSON.stringify( + { + ...accuracyResult, + runStatus: status, + }, + null, + 2 + ), + { encoding: "utf8" } + ); + } catch (error) { + console.warn( + `Could not update run status to ${status} for commit - ${commitSHA}, runId - ${runId}.`, + error + ); + throw error; + } finally { + await release(); + } // This bit is important to mark the current run as the latest run for a // commit so that we can use that during baseline comparison. 
@@ -63,10 +80,11 @@ export class DiskBasedResultStorage implements AccuracyResultStorage { prompt: string, modelResponse: ModelResponse ): Promise { - await this.atomicWriteResult(commitSHA, runId, async () => { - const accuracyResult = await this.getAccuracyResult(commitSHA, runId); - if (!accuracyResult) { - return { + const resultFilePath = this.getAccuracyResultFilePath(commitSHA, runId); + const { fileCreatedWithInitialData } = await this.ensureAccuracyResultFile( + resultFilePath, + JSON.stringify( + { runId, runStatus: AccuracyRunStatus.InProgress, createdOn: Date.now(), @@ -77,22 +95,43 @@ export class DiskBasedResultStorage implements AccuracyResultStorage { modelResponses: [modelResponse], }, ], - }; + }, + null, + 2 + ) + ); + + if (fileCreatedWithInitialData) { + return; + } + + const releaseLock = await lock(resultFilePath, { retries: 10 }); + try { + const accuracyResult = await this.getAccuracyResult(commitSHA, runId); + if (!accuracyResult) { + throw new Error("Expected at-least initial accuracy result to be present"); } const existingPromptIdx = accuracyResult.promptResults.findIndex((result) => result.prompt === prompt); const promptResult = accuracyResult.promptResults[existingPromptIdx]; if (!promptResult) { - return { - ...accuracyResult, - promptResults: [ - ...accuracyResult.promptResults, + return await fs.writeFile( + resultFilePath, + JSON.stringify( { - prompt, - modelResponses: [modelResponse], + ...accuracyResult, + promptResults: [ + ...accuracyResult.promptResults, + { + prompt, + modelResponses: [modelResponse], + }, + ], }, - ], - }; + null, + 2 + ) + ); } accuracyResult.promptResults.splice(existingPromptIdx, 1, { @@ -100,41 +139,38 @@ export class DiskBasedResultStorage implements AccuracyResultStorage { modelResponses: [...promptResult.modelResponses, modelResponse], }); - return accuracyResult; - }); + return await fs.writeFile(resultFilePath, JSON.stringify(accuracyResult, null, 2)); + } catch (error) { + console.warn(`Could 
not save model response for commit - ${commitSHA}, runId - ${runId}.`, error); + throw error; + } finally { + await releaseLock?.(); + } } close(): Promise { return Promise.resolve(); } - private async atomicWriteResult( - commitSHA: string, - runId: string, - generateResult: () => Promise - ): Promise { - for (let attempt = 0; attempt < 10; attempt++) { - // This should happen outside the try catch to let the result - // generation error bubble up. - const result = await generateResult(); - const resultFilePath = this.getAccuracyResultFilePath(commitSHA, runId); - try { - const tmp = `${resultFilePath}~${Date.now()}`; - await fs.writeFile(tmp, JSON.stringify(result, null, 2)); - await fs.rename(tmp, resultFilePath); - return; - } catch (error) { - if ((error as { code: string }).code === "ENOENT") { - const baseDir = path.dirname(resultFilePath); - await fs.mkdir(baseDir, { recursive: true }); - } - - if (attempt < 10) { - await this.waitFor(100 + Math.random() * 200); - } else { - throw error; - } + private async ensureAccuracyResultFile( + filePath: string, + initialData: string + ): Promise<{ + fileCreatedWithInitialData: boolean; + }> { + try { + await fs.mkdir(path.dirname(filePath), { recursive: true }); + await fs.writeFile(filePath, initialData, { flag: "wx" }); + return { + fileCreatedWithInitialData: true, + }; + } catch (error) { + if ((error as NodeJS.ErrnoException).code === "EEXIST") { + return { + fileCreatedWithInitialData: false, + }; } + throw error; } } From e845e1a523559bc83760c124de1d94b3f19d9c95 Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Mon, 14 Jul 2025 15:40:58 +0200 Subject: [PATCH 60/91] chore: make expectedToolCalls part of PromptResult --- scripts/accuracy/generate-test-summary.ts | 2 + .../accuracy-result-storage/disk-storage.ts | 52 +++++++++++-------- .../mongodb-storage.ts | 22 +++++--- .../accuracy-result-storage/result-storage.ts | 21 ++++---- tests/accuracy/sdk/describe-accuracy-tests.ts | 25 +++++---- 5 files changed, 
73 insertions(+), 49 deletions(-) diff --git a/scripts/accuracy/generate-test-summary.ts b/scripts/accuracy/generate-test-summary.ts index 027944e0..5dc80801 100644 --- a/scripts/accuracy/generate-test-summary.ts +++ b/scripts/accuracy/generate-test-summary.ts @@ -21,6 +21,7 @@ type ComparableAccuracyResult = Omit & { interface PromptAndModelResponse extends ModelResponse { prompt: string; + expectedToolCalls: ExpectedToolCall[]; baselineToolAccuracy?: number; } @@ -293,6 +294,7 @@ async function generateTestSummary() { return { ...currentModelResponse, prompt: currentPromptResult.prompt, + expectedToolCalls: currentPromptResult.expectedToolCalls, baselineToolAccuracy: baselineModelResponse?.toolCallingAccuracy, }; }); diff --git a/tests/accuracy/sdk/accuracy-result-storage/disk-storage.ts b/tests/accuracy/sdk/accuracy-result-storage/disk-storage.ts index 32b97056..36e57d18 100644 --- a/tests/accuracy/sdk/accuracy-result-storage/disk-storage.ts +++ b/tests/accuracy/sdk/accuracy-result-storage/disk-storage.ts @@ -7,6 +7,7 @@ import { AccuracyResultStorage, AccuracyRunStatus, AccuracyRunStatuses, + ExpectedToolCall, ModelResponse, } from "./result-storage.js"; @@ -74,31 +75,36 @@ export class DiskBasedResultStorage implements AccuracyResultStorage { } } - async saveModelResponseForPrompt( - commitSHA: string, - runId: string, - prompt: string, - modelResponse: ModelResponse - ): Promise { + async saveModelResponseForPrompt({ + commitSHA, + runId, + prompt, + expectedToolCalls, + modelResponse, + }: { + commitSHA: string; + runId: string; + prompt: string; + expectedToolCalls: ExpectedToolCall[]; + modelResponse: ModelResponse; + }): Promise { + const initialData: AccuracyResult = { + runId, + runStatus: AccuracyRunStatus.InProgress, + createdOn: Date.now(), + commitSHA, + promptResults: [ + { + prompt, + expectedToolCalls, + modelResponses: [modelResponse], + }, + ], + }; const resultFilePath = this.getAccuracyResultFilePath(commitSHA, runId); const { 
fileCreatedWithInitialData } = await this.ensureAccuracyResultFile( resultFilePath, - JSON.stringify( - { - runId, - runStatus: AccuracyRunStatus.InProgress, - createdOn: Date.now(), - commitSHA, - promptResults: [ - { - prompt, - modelResponses: [modelResponse], - }, - ], - }, - null, - 2 - ) + JSON.stringify(initialData, null, 2) ); if (fileCreatedWithInitialData) { @@ -124,6 +130,7 @@ export class DiskBasedResultStorage implements AccuracyResultStorage { ...accuracyResult.promptResults, { prompt, + expectedToolCalls, modelResponses: [modelResponse], }, ], @@ -136,6 +143,7 @@ export class DiskBasedResultStorage implements AccuracyResultStorage { accuracyResult.promptResults.splice(existingPromptIdx, 1, { prompt: promptResult.prompt, + expectedToolCalls: promptResult.expectedToolCalls, modelResponses: [...promptResult.modelResponses, modelResponse], }); diff --git a/tests/accuracy/sdk/accuracy-result-storage/mongodb-storage.ts b/tests/accuracy/sdk/accuracy-result-storage/mongodb-storage.ts index 694a78be..2796dfe4 100644 --- a/tests/accuracy/sdk/accuracy-result-storage/mongodb-storage.ts +++ b/tests/accuracy/sdk/accuracy-result-storage/mongodb-storage.ts @@ -4,6 +4,7 @@ import { AccuracyResultStorage, AccuracyRunStatus, AccuracyRunStatuses, + ExpectedToolCall, ModelResponse, } from "./result-storage.js"; @@ -48,12 +49,19 @@ export class MongoDBBasedResultStorage implements AccuracyResultStorage { ); } - async saveModelResponseForPrompt( - commitSHA: string, - runId: string, - prompt: string, - modelResponse: ModelResponse - ): Promise { + async saveModelResponseForPrompt({ + commitSHA, + runId, + prompt, + expectedToolCalls, + modelResponse, + }: { + commitSHA: string; + runId: string; + prompt: string; + expectedToolCalls: ExpectedToolCall[]; + modelResponse: ModelResponse; + }): Promise { const savedModelResponse: ModelResponse = { ...modelResponse }; for (const field of this.omittedModelResponseFields) { delete savedModelResponse[field]; @@ -81,7 +89,7 @@ 
export class MongoDBBasedResultStorage implements AccuracyResultStorage { }, { $push: { - promptResults: { prompt, modelResponses: [] }, + promptResults: { prompt, expectedToolCalls, modelResponses: [] }, }, } ); diff --git a/tests/accuracy/sdk/accuracy-result-storage/result-storage.ts b/tests/accuracy/sdk/accuracy-result-storage/result-storage.ts index 737ee32d..845af8a0 100644 --- a/tests/accuracy/sdk/accuracy-result-storage/result-storage.ts +++ b/tests/accuracy/sdk/accuracy-result-storage/result-storage.ts @@ -42,6 +42,10 @@ export interface PromptResult { /** * The actual prompt that was provided to LLM as test */ prompt: string; + /** + * A list of tools, along with their parameters, that are expected to be + * called by the LLM in test. */ + expectedToolCalls: ExpectedToolCall[]; /** * The responses from the LLMs tested, when provided with the prompt. */ modelResponses: ModelResponse[]; @@ -65,10 +69,6 @@ export interface ModelResponse { * were called by LLM when responding to the provided prompts. To know more * about how this number is generated, check - toolCallingAccuracy.ts */ toolCallingAccuracy: number; - /** - * A list of tools, along with their parameters, that are expected to be - * called by the LLM in test. */ - expectedToolCalls: ExpectedToolCall[]; /** * A list of tools, along with their parameters, that were actually called * by the LLM in test. */ @@ -106,11 +106,12 @@ export interface AccuracyResultStorage { /** * Attempts to atomically insert the model response for the prompt in the * stored accuracy result. 
*/ - saveModelResponseForPrompt( - commitSHA: string, - runId: string, - prompt: string, - modelResponse: ModelResponse - ): Promise; + saveModelResponseForPrompt(data: { + commitSHA: string; + runId: string; + prompt: string; + expectedToolCalls: ExpectedToolCall[]; + modelResponse: ModelResponse; + }): Promise; close(): Promise; } diff --git a/tests/accuracy/sdk/describe-accuracy-tests.ts b/tests/accuracy/sdk/describe-accuracy-tests.ts index b964cd48..02e1b9e2 100644 --- a/tests/accuracy/sdk/describe-accuracy-tests.ts +++ b/tests/accuracy/sdk/describe-accuracy-tests.ts @@ -102,17 +102,22 @@ export function describeAccuracyTests(models: TestableModels, accuracyTestConfig const toolCallingAccuracy = calculateToolCallingAccuracy(testConfig.expectedToolCalls, llmToolCalls); const responseTime = timeAfterPrompt - timeBeforePrompt; - await accuracyResultStorage.saveModelResponseForPrompt(commitSHA, accuracyRunId, testConfig.prompt, { - provider: model.provider, - requestedModel: model.modelName, - respondingModel: result.respondingModel, - llmResponseTime: responseTime, - toolCallingAccuracy: toolCallingAccuracy, + await accuracyResultStorage.saveModelResponseForPrompt({ + commitSHA, + runId: accuracyRunId, + prompt: testConfig.prompt, expectedToolCalls: testConfig.expectedToolCalls, - llmToolCalls: llmToolCalls, - tokensUsed: result.tokensUsage, - text: result.text, - messages: result.messages, + modelResponse: { + provider: model.provider, + requestedModel: model.modelName, + respondingModel: result.respondingModel, + llmResponseTime: responseTime, + toolCallingAccuracy: toolCallingAccuracy, + llmToolCalls: llmToolCalls, + tokensUsed: result.tokensUsage, + text: result.text, + messages: result.messages, + }, }); }); }); From 4f41af575838ed1676b8ef56dc238c664abea96d Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Mon, 14 Jul 2025 15:54:23 +0200 Subject: [PATCH 61/91] chore: make omitted fields a const --- .../sdk/accuracy-result-storage/mongodb-storage.ts | 13 
+++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/tests/accuracy/sdk/accuracy-result-storage/mongodb-storage.ts b/tests/accuracy/sdk/accuracy-result-storage/mongodb-storage.ts index 2796dfe4..6adb6e85 100644 --- a/tests/accuracy/sdk/accuracy-result-storage/mongodb-storage.ts +++ b/tests/accuracy/sdk/accuracy-result-storage/mongodb-storage.ts @@ -8,17 +8,14 @@ import { ModelResponse, } from "./result-storage.js"; +// Omitting these as they might contain large chunk of texts +const OMITTED_MODEL_RESPONSE_FIELDS: (keyof ModelResponse)[] = ["messages", "text"]; + export class MongoDBBasedResultStorage implements AccuracyResultStorage { private client: MongoClient; private resultCollection: Collection; - constructor( - connectionString: string, - database: string, - collection: string, - // Omitting these as they might contain large chunk of texts - private readonly omittedModelResponseFields: (keyof ModelResponse)[] = ["messages", "text"] - ) { + constructor(connectionString: string, database: string, collection: string) { this.client = new MongoClient(connectionString); this.resultCollection = this.client.db(database).collection(collection); } @@ -63,7 +60,7 @@ export class MongoDBBasedResultStorage implements AccuracyResultStorage { modelResponse: ModelResponse; }): Promise { const savedModelResponse: ModelResponse = { ...modelResponse }; - for (const field of this.omittedModelResponseFields) { + for (const field of OMITTED_MODEL_RESPONSE_FIELDS) { delete savedModelResponse[field]; } From e421125408df9adad04059fa5d83e08e27e81fa9 Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Mon, 14 Jul 2025 15:59:17 +0200 Subject: [PATCH 62/91] chore: update formatRunStatus as per feedback --- scripts/accuracy/generate-test-summary.ts | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/scripts/accuracy/generate-test-summary.ts b/scripts/accuracy/generate-test-summary.ts index 5dc80801..2c73cc7c 100644 --- 
a/scripts/accuracy/generate-test-summary.ts +++ b/scripts/accuracy/generate-test-summary.ts @@ -37,15 +37,13 @@ function populateTemplate(template: string, data: Record): strin } function formatRunStatus(status: AccuracyRunStatuses) { - let statusClass = "chip run-status"; + const statusClasses = ["chip", "run-status"]; if (status === "done") { - statusClass += " perfect"; - } else if (status === "in-progress") { - statusClass += " poor"; - } else if (status === "failed") { - statusClass += " poor"; + statusClasses.push("perfect"); + } else if (status === "in-progress" || status === "failed") { + statusClasses.push("poor"); } - return `${status}`; + return `${status}`; } function formatAccuracy(accuracy: number): string { @@ -76,7 +74,7 @@ function formatTokenUsage(tokensUsage: { const prompt = tokensUsage.promptTokens || "-"; const completion = tokensUsage.completionTokens || "-"; - const tooltip = `Prompt: ${prompt}\nCompletion: ${completion}\nTotal: ${total}`; + const tooltip = [`Prompt: ${prompt}`, `Completion: ${completion}`, `Total: ${total}`].join("\n"); return `${total}`; } From 2c2c4287dd3f44e696bf5d48f5c641f01eb5a26a Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Mon, 14 Jul 2025 17:55:54 +0200 Subject: [PATCH 63/91] chore: move saveModelResponseForPromptAtomic to atomic update pipeline --- .../mongodb-storage.ts | 89 +++++++++++++++++++ 1 file changed, 89 insertions(+) diff --git a/tests/accuracy/sdk/accuracy-result-storage/mongodb-storage.ts b/tests/accuracy/sdk/accuracy-result-storage/mongodb-storage.ts index 6adb6e85..38cccbae 100644 --- a/tests/accuracy/sdk/accuracy-result-storage/mongodb-storage.ts +++ b/tests/accuracy/sdk/accuracy-result-storage/mongodb-storage.ts @@ -104,6 +104,95 @@ export class MongoDBBasedResultStorage implements AccuracyResultStorage { ); } + async saveModelResponseForPromptAtomic({ + commitSHA, + runId, + prompt, + expectedToolCalls, + modelResponse, + }: { + commitSHA: string; + runId: string; + prompt: string; + 
expectedToolCalls: ExpectedToolCall[]; + modelResponse: ModelResponse; + }): Promise { + const savedModelResponse: ModelResponse = { ...modelResponse }; + for (const field of OMITTED_MODEL_RESPONSE_FIELDS) { + delete savedModelResponse[field]; + } + + await this.resultCollection.updateOne( + { commitSHA, runId }, + [ + { + $set: { + runStatus: { + $ifNull: ["$runStatus", AccuracyRunStatus.InProgress], + }, + createdOn: { + $ifNull: ["$createdOn", Date.now()], + }, + commitSHA: commitSHA, + runId: runId, + promptResults: { + $let: { + vars: { + existingPrompts: { $ifNull: ["$promptResults", []] }, + promptExists: { + $in: [ + prompt, + { + $ifNull: [ + { $map: { input: "$promptResults", as: "pr", in: "$$pr.prompt" } }, + [], + ], + }, + ], + }, + }, + in: { + $map: { + input: { + $cond: { + if: "$$promptExists", + then: "$$existingPrompts", + else: { + $concatArrays: [ + "$$existingPrompts", + [{ prompt, expectedToolCalls, modelResponses: [] }], + ], + }, + }, + }, + as: "promptResult", + in: { + $cond: { + if: { $eq: ["$$promptResult.prompt", prompt] }, + then: { + prompt: "$$promptResult.prompt", + expectedToolCalls: "$$promptResult.expectedToolCalls", + modelResponses: { + $concatArrays: [ + "$$promptResult.modelResponses", + [savedModelResponse], + ], + }, + }, + else: "$$promptResult", + }, + }, + }, + }, + }, + }, + }, + }, + ], + { upsert: true } + ); + } + async close(): Promise { await this.client.close(); } From 34214adceedcf3ad99baf39c8765d8a10b48611a Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Tue, 15 Jul 2025 10:34:46 +0200 Subject: [PATCH 64/91] chore: prefer exclusive reads for public interface --- .../accuracy-result-storage/disk-storage.ts | 36 +++++++++++++++---- 1 file changed, 30 insertions(+), 6 deletions(-) diff --git a/tests/accuracy/sdk/accuracy-result-storage/disk-storage.ts b/tests/accuracy/sdk/accuracy-result-storage/disk-storage.ts index 36e57d18..06a5480d 100644 --- a/tests/accuracy/sdk/accuracy-result-storage/disk-storage.ts +++ 
b/tests/accuracy/sdk/accuracy-result-storage/disk-storage.ts @@ -12,7 +12,23 @@ import { } from "./result-storage.js"; export class DiskBasedResultStorage implements AccuracyResultStorage { - async getAccuracyResult(commitSHA: string, runId?: string): Promise { + /** + * + * @param commitSHA The commit for which accuracy result needs to be + * fetched. + * @param runId An optional runId to get the result for. If the runId is not + * provided then the result of the latest run are fetched. + * @param preferExclusiveRead An optional flag, which when set to false, + * will not lock the result file before reading otherwise the default + * behavior is to lock the result file before reading. This should always be + * set to false when the calling context already holds the lock on the + * result file. + */ + async getAccuracyResult( + commitSHA: string, + runId?: string, + preferExclusiveRead?: boolean + ): Promise { const filePath = runId ? // If we have both commit and runId then we get the path for // specific file. Common case when saving prompt responses during an @@ -23,6 +39,10 @@ export class DiskBasedResultStorage implements AccuracyResultStorage { // marked as successful. 
this.getAccuracyResultFilePath(commitSHA, LATEST_ACCURACY_RUN_NAME); + let releaseLock: (() => Promise) | undefined; + if (preferExclusiveRead !== false) { + releaseLock = await lock(filePath); + } try { const raw = await fs.readFile(filePath, "utf8"); return JSON.parse(raw) as AccuracyResult; @@ -31,14 +51,17 @@ export class DiskBasedResultStorage implements AccuracyResultStorage { return null; } throw error; + } finally { + await releaseLock?.(); } } async updateRunStatus(commitSHA: string, runId: string, status: AccuracyRunStatuses): Promise { const resultFilePath = this.getAccuracyResultFilePath(commitSHA, runId); - const release = await lock(resultFilePath, { retries: 10 }); + let releaseLock: (() => Promise) | undefined; try { - const accuracyResult = await this.getAccuracyResult(commitSHA, runId); + releaseLock = await lock(resultFilePath, { retries: 10 }); + const accuracyResult = await this.getAccuracyResult(commitSHA, runId, false); if (!accuracyResult) { throw new Error("Results not found!"); } @@ -62,7 +85,7 @@ export class DiskBasedResultStorage implements AccuracyResultStorage { ); throw error; } finally { - await release(); + await releaseLock?.(); } // This bit is important to mark the current run as the latest run for a @@ -111,9 +134,10 @@ export class DiskBasedResultStorage implements AccuracyResultStorage { return; } - const releaseLock = await lock(resultFilePath, { retries: 10 }); + let releaseLock: (() => Promise) | undefined; try { - const accuracyResult = await this.getAccuracyResult(commitSHA, runId); + releaseLock = await lock(resultFilePath, { retries: 10 }); + const accuracyResult = await this.getAccuracyResult(commitSHA, runId, false); if (!accuracyResult) { throw new Error("Expected at-least initial accuracy result to be present"); } From 508f906b4a57547210e8dc19d7bfe3c188c349c8 Mon Sep 17 00:00:00 2001 From: Nikola Irinchev Date: Tue, 15 Jul 2025 11:25:38 +0200 Subject: [PATCH 65/91] chore: minor refactor of disk-storage (#370) --- 
.../accuracy-result-storage/disk-storage.ts | 105 ++++++------------ 1 file changed, 37 insertions(+), 68 deletions(-) diff --git a/tests/accuracy/sdk/accuracy-result-storage/disk-storage.ts b/tests/accuracy/sdk/accuracy-result-storage/disk-storage.ts index 06a5480d..bc09da9e 100644 --- a/tests/accuracy/sdk/accuracy-result-storage/disk-storage.ts +++ b/tests/accuracy/sdk/accuracy-result-storage/disk-storage.ts @@ -58,9 +58,7 @@ export class DiskBasedResultStorage implements AccuracyResultStorage { async updateRunStatus(commitSHA: string, runId: string, status: AccuracyRunStatuses): Promise { const resultFilePath = this.getAccuracyResultFilePath(commitSHA, runId); - let releaseLock: (() => Promise) | undefined; - try { - releaseLock = await lock(resultFilePath, { retries: 10 }); + await this.withFileLock(resultFilePath, async () => { const accuracyResult = await this.getAccuracyResult(commitSHA, runId, false); if (!accuracyResult) { throw new Error("Results not found!"); @@ -78,23 +76,16 @@ export class DiskBasedResultStorage implements AccuracyResultStorage { ), { encoding: "utf8" } ); - } catch (error) { - console.warn( - `Could not update run status to ${status} for commit - ${commitSHA}, runId - ${runId}.`, - error - ); - throw error; - } finally { - await releaseLock?.(); - } + }); // This bit is important to mark the current run as the latest run for a // commit so that we can use that during baseline comparison. 
if (status === AccuracyRunStatus.Done) { - await this.atomicUpdateLink( - this.getAccuracyResultFilePath(commitSHA, runId), - this.getLatestResultFilePath(commitSHA) - ); + const latestResultFilePath = this.getLatestResultFilePath(commitSHA); + await this.withFileLock(latestResultFilePath, async () => { + await fs.unlink(latestResultFilePath); + await fs.link(resultFilePath, latestResultFilePath); + }); } } @@ -134,50 +125,36 @@ export class DiskBasedResultStorage implements AccuracyResultStorage { return; } - let releaseLock: (() => Promise) | undefined; - try { - releaseLock = await lock(resultFilePath, { retries: 10 }); - const accuracyResult = await this.getAccuracyResult(commitSHA, runId, false); + await this.withFileLock(resultFilePath, async () => { + let accuracyResult = await this.getAccuracyResult(commitSHA, runId, false); if (!accuracyResult) { throw new Error("Expected at-least initial accuracy result to be present"); } const existingPromptIdx = accuracyResult.promptResults.findIndex((result) => result.prompt === prompt); const promptResult = accuracyResult.promptResults[existingPromptIdx]; - if (!promptResult) { - return await fs.writeFile( - resultFilePath, - JSON.stringify( + if (promptResult) { + accuracyResult.promptResults.splice(existingPromptIdx, 1, { + prompt: promptResult.prompt, + expectedToolCalls: promptResult.expectedToolCalls, + modelResponses: [...promptResult.modelResponses, modelResponse], + }); + } else { + accuracyResult = { + ...accuracyResult, + promptResults: [ + ...accuracyResult.promptResults, { - ...accuracyResult, - promptResults: [ - ...accuracyResult.promptResults, - { - prompt, - expectedToolCalls, - modelResponses: [modelResponse], - }, - ], + prompt, + expectedToolCalls, + modelResponses: [modelResponse], }, - null, - 2 - ) - ); + ], + }; } - accuracyResult.promptResults.splice(existingPromptIdx, 1, { - prompt: promptResult.prompt, - expectedToolCalls: promptResult.expectedToolCalls, - modelResponses: 
[...promptResult.modelResponses, modelResponse], - }); - - return await fs.writeFile(resultFilePath, JSON.stringify(accuracyResult, null, 2)); - } catch (error) { - console.warn(`Could not save model response for commit - ${commitSHA}, runId - ${runId}.`, error); - throw error; - } finally { - await releaseLock?.(); - } + await fs.writeFile(resultFilePath, JSON.stringify(accuracyResult, null, 2)); + }); } close(): Promise { @@ -206,20 +183,16 @@ export class DiskBasedResultStorage implements AccuracyResultStorage { } } - private async atomicUpdateLink(filePath: string, linkPath: string) { - for (let attempt = 0; attempt < 10; attempt++) { - try { - const tempLinkPath = `${linkPath}~${Date.now()}`; - await fs.link(filePath, tempLinkPath); - await fs.rename(tempLinkPath, linkPath); - return; - } catch (error) { - if (attempt < 10) { - await this.waitFor(100 + Math.random() * 200); - } else { - throw error; - } - } + private async withFileLock(filePath: string, callback: () => Promise): Promise { + let releaseLock: (() => Promise) | undefined; + try { + releaseLock = await lock(filePath, { retries: 10 }); + await callback(); + } catch (error) { + console.warn(`Could not acquire lock for file - ${filePath}.`, error); + throw error; + } finally { + await releaseLock?.(); } } @@ -230,8 +203,4 @@ export class DiskBasedResultStorage implements AccuracyResultStorage { private getLatestResultFilePath(commitSHA: string): string { return path.join(ACCURACY_RESULTS_DIR, commitSHA, `${LATEST_ACCURACY_RUN_NAME}.json`); } - - private waitFor(ms: number) { - return new Promise((resolve) => setTimeout(resolve, ms)); - } } From d3f1f7354fab9ad16e549a2174f10f435da4250d Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Tue, 15 Jul 2025 11:43:02 +0200 Subject: [PATCH 66/91] chore: simplify getAccuracyResult --- .../accuracy-result-storage/disk-storage.ts | 54 +++++++------------ 1 file changed, 18 insertions(+), 36 deletions(-) diff --git 
a/tests/accuracy/sdk/accuracy-result-storage/disk-storage.ts b/tests/accuracy/sdk/accuracy-result-storage/disk-storage.ts index bc09da9e..e138257b 100644 --- a/tests/accuracy/sdk/accuracy-result-storage/disk-storage.ts +++ b/tests/accuracy/sdk/accuracy-result-storage/disk-storage.ts @@ -12,23 +12,7 @@ import { } from "./result-storage.js"; export class DiskBasedResultStorage implements AccuracyResultStorage { - /** - * - * @param commitSHA The commit for which accuracy result needs to be - * fetched. - * @param runId An optional runId to get the result for. If the runId is not - * provided then the result of the latest run are fetched. - * @param preferExclusiveRead An optional flag, which when set to false, - * will not lock the result file before reading otherwise the default - * behavior is to lock the result file before reading. This should always be - * set to false when the calling context already holds the lock on the - * result file. - */ - async getAccuracyResult( - commitSHA: string, - runId?: string, - preferExclusiveRead?: boolean - ): Promise { + async getAccuracyResult(commitSHA: string, runId?: string): Promise { const filePath = runId ? // If we have both commit and runId then we get the path for // specific file. Common case when saving prompt responses during an @@ -39,27 +23,13 @@ export class DiskBasedResultStorage implements AccuracyResultStorage { // marked as successful. 
this.getAccuracyResultFilePath(commitSHA, LATEST_ACCURACY_RUN_NAME); - let releaseLock: (() => Promise) | undefined; - if (preferExclusiveRead !== false) { - releaseLock = await lock(filePath); - } - try { - const raw = await fs.readFile(filePath, "utf8"); - return JSON.parse(raw) as AccuracyResult; - } catch (error) { - if ((error as NodeJS.ErrnoException).code === "ENOENT") { - return null; - } - throw error; - } finally { - await releaseLock?.(); - } + return this.withFileLock(filePath, () => this.getAccuracyResultWithoutLock(filePath)); } async updateRunStatus(commitSHA: string, runId: string, status: AccuracyRunStatuses): Promise { const resultFilePath = this.getAccuracyResultFilePath(commitSHA, runId); await this.withFileLock(resultFilePath, async () => { - const accuracyResult = await this.getAccuracyResult(commitSHA, runId, false); + const accuracyResult = await this.getAccuracyResultWithoutLock(resultFilePath); if (!accuracyResult) { throw new Error("Results not found!"); } @@ -126,7 +96,7 @@ export class DiskBasedResultStorage implements AccuracyResultStorage { } await this.withFileLock(resultFilePath, async () => { - let accuracyResult = await this.getAccuracyResult(commitSHA, runId, false); + let accuracyResult = await this.getAccuracyResultWithoutLock(resultFilePath); if (!accuracyResult) { throw new Error("Expected at-least initial accuracy result to be present"); } @@ -161,6 +131,18 @@ export class DiskBasedResultStorage implements AccuracyResultStorage { return Promise.resolve(); } + private async getAccuracyResultWithoutLock(filePath: string): Promise { + try { + const raw = await fs.readFile(filePath, "utf8"); + return JSON.parse(raw) as AccuracyResult; + } catch (error) { + if ((error as NodeJS.ErrnoException).code === "ENOENT") { + return null; + } + throw error; + } + } + private async ensureAccuracyResultFile( filePath: string, initialData: string @@ -183,11 +165,11 @@ export class DiskBasedResultStorage implements AccuracyResultStorage { } } 
- private async withFileLock(filePath: string, callback: () => Promise): Promise { + private async withFileLock(filePath: string, callback: () => Promise): Promise { let releaseLock: (() => Promise) | undefined; try { releaseLock = await lock(filePath, { retries: 10 }); - await callback(); + return await callback(); } catch (error) { console.warn(`Could not acquire lock for file - ${filePath}.`, error); throw error; From ea127bf01d4ebd48972c41bc5a521e14f9561807 Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Tue, 15 Jul 2025 12:24:25 +0200 Subject: [PATCH 67/91] chore: simplified the update pipeline and added tool call serialization --- .../mongodb-storage.ts | 201 ++++++++---------- 1 file changed, 94 insertions(+), 107 deletions(-) diff --git a/tests/accuracy/sdk/accuracy-result-storage/mongodb-storage.ts b/tests/accuracy/sdk/accuracy-result-storage/mongodb-storage.ts index 38cccbae..21ccb4e2 100644 --- a/tests/accuracy/sdk/accuracy-result-storage/mongodb-storage.ts +++ b/tests/accuracy/sdk/accuracy-result-storage/mongodb-storage.ts @@ -5,19 +5,38 @@ import { AccuracyRunStatus, AccuracyRunStatuses, ExpectedToolCall, + LLMToolCall, ModelResponse, + PromptResult, } from "./result-storage.js"; // Omitting these as they might contain large chunk of texts const OMITTED_MODEL_RESPONSE_FIELDS: (keyof ModelResponse)[] = ["messages", "text"]; +// The LLMToolCalls and ExpectedToolCalls are expected to have mongodb operators +// nested in the objects. This interferes with the update operation that we do +// on the accuracy result document to save the model responses which is why we +// serialize them before saving and deserialize them on fetch. 
+type SavedAccuracyResult = Omit & { + promptResults: SavedPromptResult[]; +}; + +type SavedPromptResult = Omit & { + expectedToolCalls: string; + modelResponses: SavedModelResponse[]; +}; + +type SavedModelResponse = Omit & { + llmToolCalls: string; +}; + export class MongoDBBasedResultStorage implements AccuracyResultStorage { private client: MongoClient; - private resultCollection: Collection; + private resultCollection: Collection; constructor(connectionString: string, database: string, collection: string) { this.client = new MongoClient(connectionString); - this.resultCollection = this.client.db(database).collection(collection); + this.resultCollection = this.client.db(database).collection(collection); } async getAccuracyResult(commitSHA: string, runId?: string): Promise { @@ -28,11 +47,14 @@ export class MongoDBBasedResultStorage implements AccuracyResultStorage { // for commit is when you want the last successful run of that // particular commit. { commitSHA, runStatus: AccuracyRunStatus.Done }; - return await this.resultCollection.findOne(filters, { + + const result = await this.resultCollection.findOne(filters, { sort: { createdOn: -1, }, }); + + return result ? 
this.deserializeSavedResult(result) : result; } async updateRunStatus(commitSHA: string, runId: string, status: AccuracyRunStatuses): Promise { @@ -59,67 +81,14 @@ export class MongoDBBasedResultStorage implements AccuracyResultStorage { expectedToolCalls: ExpectedToolCall[]; modelResponse: ModelResponse; }): Promise { - const savedModelResponse: ModelResponse = { ...modelResponse }; - for (const field of OMITTED_MODEL_RESPONSE_FIELDS) { - delete savedModelResponse[field]; - } - - await this.resultCollection.updateOne( - { commitSHA, runId }, - { - $setOnInsert: { - runStatus: AccuracyRunStatus.InProgress, - createdOn: Date.now(), - commitSHA, - runId, - promptResults: [], - }, - }, - { upsert: true } - ); - - await this.resultCollection.updateOne( - { - commitSHA, - runId, - "promptResults.prompt": { $ne: prompt }, - }, - { - $push: { - promptResults: { prompt, expectedToolCalls, modelResponses: [] }, - }, - } - ); + const expectedToolCallsToSave = JSON.stringify(expectedToolCalls); + const modelResponseToSave: SavedModelResponse = { + ...modelResponse, + llmToolCalls: JSON.stringify(modelResponse.llmToolCalls), + }; - await this.resultCollection.updateOne( - { commitSHA, runId }, - { - $push: { - "promptResults.$[promptElement].modelResponses": savedModelResponse, - }, - }, - { - arrayFilters: [{ "promptElement.prompt": prompt }], - } - ); - } - - async saveModelResponseForPromptAtomic({ - commitSHA, - runId, - prompt, - expectedToolCalls, - modelResponse, - }: { - commitSHA: string; - runId: string; - prompt: string; - expectedToolCalls: ExpectedToolCall[]; - modelResponse: ModelResponse; - }): Promise { - const savedModelResponse: ModelResponse = { ...modelResponse }; for (const field of OMITTED_MODEL_RESPONSE_FIELDS) { - delete savedModelResponse[field]; + delete modelResponseToSave[field]; } await this.resultCollection.updateOne( @@ -127,62 +96,62 @@ export class MongoDBBasedResultStorage implements AccuracyResultStorage { [ { $set: { - runStatus: { - 
$ifNull: ["$runStatus", AccuracyRunStatus.InProgress], - }, - createdOn: { - $ifNull: ["$createdOn", Date.now()], + runStatus: { $ifNull: ["$runStatus", AccuracyRunStatus.InProgress] }, + createdOn: { $ifNull: ["$createdOn", Date.now()] }, + commitSHA: { $ifNull: ["$commitSHA", commitSHA] }, + runId: { $ifNull: ["$runId", runId] }, + promptResults: { + $ifNull: ["$promptResults", []], }, - commitSHA: commitSHA, - runId: runId, + }, + }, + { + $set: { promptResults: { $let: { vars: { - existingPrompts: { $ifNull: ["$promptResults", []] }, - promptExists: { - $in: [ - prompt, - { - $ifNull: [ - { $map: { input: "$promptResults", as: "pr", in: "$$pr.prompt" } }, - [], - ], - }, - ], + existingPromptIndex: { + $indexOfArray: ["$promptResults.prompt", prompt], }, }, in: { - $map: { - input: { - $cond: { - if: "$$promptExists", - then: "$$existingPrompts", - else: { - $concatArrays: [ - "$$existingPrompts", - [{ prompt, expectedToolCalls, modelResponses: [] }], - ], - }, - }, - }, - as: "promptResult", - in: { - $cond: { - if: { $eq: ["$$promptResult.prompt", prompt] }, - then: { - prompt: "$$promptResult.prompt", - expectedToolCalls: "$$promptResult.expectedToolCalls", - modelResponses: { - $concatArrays: [ - "$$promptResult.modelResponses", - [savedModelResponse], - ], + $cond: [ + { $eq: ["$$existingPromptIndex", -1] }, + { + $concatArrays: [ + "$promptResults", + [ + { + prompt, + expectedToolCalls: expectedToolCallsToSave, + modelResponses: [modelResponseToSave], }, + ], + ], + }, + { + $map: { + input: "$promptResults", + as: "promptResult", + in: { + $cond: [ + { $eq: ["$$promptResult.prompt", prompt] }, + { + prompt: "$$promptResult.prompt", + expectedToolCalls: expectedToolCallsToSave, + modelResponses: { + $concatArrays: [ + "$$promptResult.modelResponses", + [modelResponseToSave], + ], + }, + }, + "$$promptResult", + ], }, - else: "$$promptResult", }, }, - }, + ], }, }, }, @@ -193,6 +162,24 @@ export class MongoDBBasedResultStorage implements 
AccuracyResultStorage { ); } + private deserializeSavedResult(result: SavedAccuracyResult): AccuracyResult { + return { + ...result, + promptResults: result.promptResults.map((result) => { + return { + ...result, + expectedToolCalls: JSON.parse(result.expectedToolCalls) as ExpectedToolCall[], + modelResponses: result.modelResponses.map((response) => { + return { + ...response, + llmToolCalls: JSON.parse(response.llmToolCalls) as LLMToolCall[], + }; + }), + }; + }), + }; + } + async close(): Promise { await this.client.close(); } From acba3b41a5715703fb4b928cd78a7ea1feae2731 Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Tue, 15 Jul 2025 12:43:22 +0200 Subject: [PATCH 68/91] chore: use $literal instead of serializing the tool calls --- .../mongodb-storage.ts | 59 ++++--------------- 1 file changed, 11 insertions(+), 48 deletions(-) diff --git a/tests/accuracy/sdk/accuracy-result-storage/mongodb-storage.ts b/tests/accuracy/sdk/accuracy-result-storage/mongodb-storage.ts index 21ccb4e2..9c9cc873 100644 --- a/tests/accuracy/sdk/accuracy-result-storage/mongodb-storage.ts +++ b/tests/accuracy/sdk/accuracy-result-storage/mongodb-storage.ts @@ -13,30 +13,13 @@ import { // Omitting these as they might contain large chunk of texts const OMITTED_MODEL_RESPONSE_FIELDS: (keyof ModelResponse)[] = ["messages", "text"]; -// The LLMToolCalls and ExpectedToolCalls are expected to have mongodb operators -// nested in the objects. This interferes with the update operation that we do -// on the accuracy result document to save the model responses which is why we -// serialize them before saving and deserialize them on fetch. 
-type SavedAccuracyResult = Omit & { - promptResults: SavedPromptResult[]; -}; - -type SavedPromptResult = Omit & { - expectedToolCalls: string; - modelResponses: SavedModelResponse[]; -}; - -type SavedModelResponse = Omit & { - llmToolCalls: string; -}; - export class MongoDBBasedResultStorage implements AccuracyResultStorage { private client: MongoClient; - private resultCollection: Collection; + private resultCollection: Collection; constructor(connectionString: string, database: string, collection: string) { this.client = new MongoClient(connectionString); - this.resultCollection = this.client.db(database).collection(collection); + this.resultCollection = this.client.db(database).collection(collection); } async getAccuracyResult(commitSHA: string, runId?: string): Promise { @@ -48,13 +31,11 @@ export class MongoDBBasedResultStorage implements AccuracyResultStorage { // particular commit. { commitSHA, runStatus: AccuracyRunStatus.Done }; - const result = await this.resultCollection.findOne(filters, { + return await this.resultCollection.findOne(filters, { sort: { createdOn: -1, }, }); - - return result ? 
this.deserializeSavedResult(result) : result; } async updateRunStatus(commitSHA: string, runId: string, status: AccuracyRunStatuses): Promise { @@ -81,10 +62,8 @@ export class MongoDBBasedResultStorage implements AccuracyResultStorage { expectedToolCalls: ExpectedToolCall[]; modelResponse: ModelResponse; }): Promise { - const expectedToolCallsToSave = JSON.stringify(expectedToolCalls); - const modelResponseToSave: SavedModelResponse = { + const modelResponseToSave: ModelResponse = { ...modelResponse, - llmToolCalls: JSON.stringify(modelResponse.llmToolCalls), }; for (const field of OMITTED_MODEL_RESPONSE_FIELDS) { @@ -122,9 +101,11 @@ export class MongoDBBasedResultStorage implements AccuracyResultStorage { "$promptResults", [ { - prompt, - expectedToolCalls: expectedToolCallsToSave, - modelResponses: [modelResponseToSave], + $literal: { + prompt, + expectedToolCalls, + modelResponses: [modelResponseToSave], + }, }, ], ], @@ -138,11 +119,11 @@ export class MongoDBBasedResultStorage implements AccuracyResultStorage { { $eq: ["$$promptResult.prompt", prompt] }, { prompt: "$$promptResult.prompt", - expectedToolCalls: expectedToolCallsToSave, + expectedToolCalls, modelResponses: { $concatArrays: [ "$$promptResult.modelResponses", - [modelResponseToSave], + [{ $literal: modelResponseToSave }], ], }, }, @@ -162,24 +143,6 @@ export class MongoDBBasedResultStorage implements AccuracyResultStorage { ); } - private deserializeSavedResult(result: SavedAccuracyResult): AccuracyResult { - return { - ...result, - promptResults: result.promptResults.map((result) => { - return { - ...result, - expectedToolCalls: JSON.parse(result.expectedToolCalls) as ExpectedToolCall[], - modelResponses: result.modelResponses.map((response) => { - return { - ...response, - llmToolCalls: JSON.parse(response.llmToolCalls) as LLMToolCall[], - }; - }), - }; - }), - }; - } - async close(): Promise { await this.client.close(); } From f0d9c799bd049a0f5e8f0fa974766d4250c69677 Mon Sep 17 00:00:00 2001 
From: Himanshu Singh Date: Tue, 15 Jul 2025 12:46:26 +0200 Subject: [PATCH 69/91] chore: don't import what is not used --- tests/accuracy/sdk/accuracy-result-storage/mongodb-storage.ts | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/accuracy/sdk/accuracy-result-storage/mongodb-storage.ts b/tests/accuracy/sdk/accuracy-result-storage/mongodb-storage.ts index 9c9cc873..8754e97c 100644 --- a/tests/accuracy/sdk/accuracy-result-storage/mongodb-storage.ts +++ b/tests/accuracy/sdk/accuracy-result-storage/mongodb-storage.ts @@ -5,9 +5,7 @@ import { AccuracyRunStatus, AccuracyRunStatuses, ExpectedToolCall, - LLMToolCall, ModelResponse, - PromptResult, } from "./result-storage.js"; // Omitting these as they might contain large chunk of texts From 7798eb1cdfe47927e658af799d50ed2e65ef5e5e Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Tue, 15 Jul 2025 13:29:40 +0200 Subject: [PATCH 70/91] chore: should use $literal also for expectedToolCalls --- tests/accuracy/sdk/accuracy-result-storage/mongodb-storage.ts | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/accuracy/sdk/accuracy-result-storage/mongodb-storage.ts b/tests/accuracy/sdk/accuracy-result-storage/mongodb-storage.ts index 8754e97c..91ba8c61 100644 --- a/tests/accuracy/sdk/accuracy-result-storage/mongodb-storage.ts +++ b/tests/accuracy/sdk/accuracy-result-storage/mongodb-storage.ts @@ -117,7 +117,9 @@ export class MongoDBBasedResultStorage implements AccuracyResultStorage { { $eq: ["$$promptResult.prompt", prompt] }, { prompt: "$$promptResult.prompt", - expectedToolCalls, + expectedToolCalls: { + $literal: expectedToolCalls, + }, modelResponses: { $concatArrays: [ "$$promptResult.modelResponses", From f303bb49cb064b0c748ce315fa5ae6d2a3379087 Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Tue, 15 Jul 2025 13:53:16 +0200 Subject: [PATCH 71/91] chore: should recreate comment and hide previous one --- .github/workflows/accuracy-tests.yml | 3 +++ 1 file changed, 3 insertions(+) diff 
--git a/.github/workflows/accuracy-tests.yml b/.github/workflows/accuracy-tests.yml index 57a4fa28..e5f08d51 100644 --- a/.github/workflows/accuracy-tests.yml +++ b/.github/workflows/accuracy-tests.yml @@ -49,4 +49,7 @@ jobs: if: github.event_name == 'pull_request' && github.event.label.name == 'accuracy-tests' uses: marocchino/sticky-pull-request-comment@d2ad0de260ae8b0235ce059e63f2949ba9e05943 # v2 with: + # Hides the previous comment and add a comment at the end + hide_and_recreate: true + hide_classify: "OUTDATED" path: .accuracy/test-brief.md From eb24505357001ff54d68bc96997a89a199b1f1e6 Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Wed, 16 Jul 2025 13:26:16 +0200 Subject: [PATCH 72/91] chore: rebase fixes and move to vitest --- package-lock.json | 36 +++++++++++++++++-- scripts/accuracy/run-accuracy-tests.sh | 16 +++------ .../accuracy-result-storage/disk-storage.ts | 5 +-- tests/accuracy/sdk/describe-accuracy-tests.ts | 1 + tests/unit/accuracy-scorer.test.ts | 1 + vitest.config.ts | 26 +++++++++++++- 6 files changed, 68 insertions(+), 17 deletions(-) diff --git a/package-lock.json b/package-lock.json index 5f32bb57..0481dfa0 100644 --- a/package-lock.json +++ b/package-lock.json @@ -35,7 +35,6 @@ "@ai-sdk/google": "^1.2.22", "@ai-sdk/openai": "^1.3.23", "@eslint/js": "^9.30.1", - "@himanshusinghs/google": "^1.2.11", "@modelcontextprotocol/inspector": "^0.16.0", "@redocly/cli": "^1.34.4", "@types/node": "^24.0.12", @@ -55,12 +54,12 @@ "openapi-typescript": "^7.8.0", "prettier": "^3.6.2", "proper-lockfile": "^4.1.2", + "simple-git": "^3.28.0", "tsx": "^4.20.3", "typescript": "^5.8.3", "typescript-eslint": "^8.36.0", - "vitest": "^3.2.4", - "simple-git": "^3.28.0", "uuid": "^11.1.0", + "vitest": "^3.2.4", "yaml": "^2.8.0" }, "engines": { @@ -9283,6 +9282,37 @@ "integrity": "sha512-Bdboy+l7tA3OGW6FjyFHWkP5LuByj1Tk33Ljyq0axyzdk9//JSi2u3fP1QSmd1KNwq6VOKYGlAu87CisVir6Pw==", "license": "MIT" }, + "node_modules/jsondiffpatch": { + "version": "0.6.0", + 
"resolved": "https://registry.npmjs.org/jsondiffpatch/-/jsondiffpatch-0.6.0.tgz", + "integrity": "sha512-3QItJOXp2AP1uv7waBkao5nCvhEv+QmJAd38Ybq7wNI74Q+BBmnLn4EDKz6yI9xGAIQoUF87qHt+kc1IVxB4zQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/diff-match-patch": "^1.0.36", + "chalk": "^5.3.0", + "diff-match-patch": "^1.0.5" + }, + "bin": { + "jsondiffpatch": "bin/jsondiffpatch.js" + }, + "engines": { + "node": "^18.0.0 || >=20.0.0" + } + }, + "node_modules/jsondiffpatch/node_modules/chalk": { + "version": "5.4.1", + "resolved": "https://registry.npmjs.org/chalk/-/chalk-5.4.1.tgz", + "integrity": "sha512-zgVZuo2WcZgfUEmsn6eO3kINexW8RAE4maiQ8QNs8CtpPCSyMiYsULR3HQYkm3w8FIA3SberyMJMSldGsW+U3w==", + "dev": true, + "license": "MIT", + "engines": { + "node": "^12.17.0 || ^14.13 || >=16.0.0" + }, + "funding": { + "url": "https://github.com/chalk/chalk?sponsor=1" + } + }, "node_modules/jsonpath-plus": { "version": "10.3.0", "resolved": "https://registry.npmjs.org/jsonpath-plus/-/jsonpath-plus-10.3.0.tgz", diff --git a/scripts/accuracy/run-accuracy-tests.sh b/scripts/accuracy/run-accuracy-tests.sh index a8b08532..9b84edcb 100644 --- a/scripts/accuracy/run-accuracy-tests.sh +++ b/scripts/accuracy/run-accuracy-tests.sh @@ -16,18 +16,12 @@ export MDB_ACCURACY_RUN_ID=$(npx uuid v4) # By default we run all the tests under tests/accuracy folder unless a path is # specified in the command line. 
Such as: # npm run test:accuracy -- tests/accuracy/some-test.test.ts -if [ $# -gt 0 ]; then - TEST_PATH_PATTERN="$1" - shift -else - TEST_PATH_PATTERN="tests/accuracy" -fi -echo "Running accuracy tests with MDB_ACCURACY_RUN_ID '$MDB_ACCURACY_RUN_ID' and TEST_PATH_PATTERN '$TEST_PATH_PATTERN'" -node --experimental-vm-modules node_modules/jest/bin/jest.js --bail --testPathPatterns "$TEST_PATH_PATTERN" "$@" +echo "Running accuracy tests with MDB_ACCURACY_RUN_ID '$MDB_ACCURACY_RUN_ID'" +vitest --config vitest.config.ts --project=accuracy --coverage=false --run --testTimeout=3600000 "$@" # Preserving the exit code from test run to correctly notify in the CI # environments when the tests fail. -JEST_EXIT_CODE=$? +TEST_EXIT_CODE=$? # Each test run submits an accuracy result with the accuracyRunStatus: # "in-progress". When all the tests are done and jest exits with an exit code of @@ -42,10 +36,10 @@ JEST_EXIT_CODE=$? # This is necessary when comparing one accuracy run with another as we wouldn't # want to compare against an incomplete run. -export MDB_ACCURACY_RUN_STATUS=$([ $JEST_EXIT_CODE -eq 0 ] && echo "done" || echo "failed") +export MDB_ACCURACY_RUN_STATUS=$([ $TEST_EXIT_CODE -eq 0 ] && echo "done" || echo "failed") npx tsx scripts/accuracy/update-accuracy-run-status.ts || echo "Warning: Failed to update accuracy run status to '$MDB_ACCURACY_RUN_STATUS'" # This is optional but we do it anyways to generate a readable summary of report. 
npx tsx scripts/accuracy/generate-test-summary.ts || echo "Warning: Failed to generate test summary HTML report" -exit $JEST_EXIT_CODE \ No newline at end of file +exit $TEST_EXIT_CODE \ No newline at end of file diff --git a/tests/accuracy/sdk/accuracy-result-storage/disk-storage.ts b/tests/accuracy/sdk/accuracy-result-storage/disk-storage.ts index e138257b..51be8c7f 100644 --- a/tests/accuracy/sdk/accuracy-result-storage/disk-storage.ts +++ b/tests/accuracy/sdk/accuracy-result-storage/disk-storage.ts @@ -52,6 +52,7 @@ export class DiskBasedResultStorage implements AccuracyResultStorage { // commit so that we can use that during baseline comparison. if (status === AccuracyRunStatus.Done) { const latestResultFilePath = this.getLatestResultFilePath(commitSHA); + await this.ensureFileWithInitialData(latestResultFilePath, JSON.stringify({})); await this.withFileLock(latestResultFilePath, async () => { await fs.unlink(latestResultFilePath); await fs.link(resultFilePath, latestResultFilePath); @@ -86,7 +87,7 @@ export class DiskBasedResultStorage implements AccuracyResultStorage { ], }; const resultFilePath = this.getAccuracyResultFilePath(commitSHA, runId); - const { fileCreatedWithInitialData } = await this.ensureAccuracyResultFile( + const { fileCreatedWithInitialData } = await this.ensureFileWithInitialData( resultFilePath, JSON.stringify(initialData, null, 2) ); @@ -143,7 +144,7 @@ export class DiskBasedResultStorage implements AccuracyResultStorage { } } - private async ensureAccuracyResultFile( + private async ensureFileWithInitialData( filePath: string, initialData: string ): Promise<{ diff --git a/tests/accuracy/sdk/describe-accuracy-tests.ts b/tests/accuracy/sdk/describe-accuracy-tests.ts index 02e1b9e2..bcd58a69 100644 --- a/tests/accuracy/sdk/describe-accuracy-tests.ts +++ b/tests/accuracy/sdk/describe-accuracy-tests.ts @@ -1,3 +1,4 @@ +import { describe, it, beforeAll, beforeEach, afterAll } from "vitest"; import { TestableModels } from "./models.js"; 
import { calculateToolCallingAccuracy } from "./accuracy-scorer.js"; import { getVercelToolCallingAgent, VercelAgent } from "./agent.js"; diff --git a/tests/unit/accuracy-scorer.test.ts b/tests/unit/accuracy-scorer.test.ts index cb844686..8519e492 100644 --- a/tests/unit/accuracy-scorer.test.ts +++ b/tests/unit/accuracy-scorer.test.ts @@ -1,3 +1,4 @@ +import { describe, expect, it } from "vitest"; import { calculateToolCallingAccuracy } from "../accuracy/sdk/accuracy-scorer.js"; import { ExpectedToolCall, LLMToolCall } from "../accuracy/sdk/accuracy-result-storage/result-storage.js"; diff --git a/vitest.config.ts b/vitest.config.ts index 31090929..7fa9d802 100644 --- a/vitest.config.ts +++ b/vitest.config.ts @@ -1,15 +1,39 @@ import { defineConfig } from "vitest/config"; +// Shared exclusions for all projects +// Ref: https://vitest.dev/config/#exclude +const vitestDefaultExcludes = [ + "**/node_modules/**", + "**/dist/**", + "**/cypress/**", + "**/.{idea,git,cache,output,temp}/**", + "**/{karma,rollup,webpack,vite,vitest,jest,ava,babel,nyc,cypress,tsup,build,eslint,prettier}.config.*", +]; + export default defineConfig({ test: { environment: "node", testTimeout: 3600000, hookTimeout: 3600000, - include: ["**/*.test.ts"], setupFiles: ["./tests/setup.ts"], coverage: { exclude: ["node_modules", "tests", "dist"], reporter: ["lcov"], }, + projects: [ + { + test: { + name: "unit-and-integration", + include: ["**/*.test.ts"], + exclude: [...vitestDefaultExcludes, "tests/accuracy/**"], + }, + }, + { + test: { + name: "accuracy", + include: ["**/accuracy/*.test.ts"], + }, + }, + ], }, }); From 8db0e6fd2e1fa9f78fed62e4e2532a6f9fa4f353 Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Wed, 16 Jul 2025 15:49:33 +0200 Subject: [PATCH 73/91] chore: run unit and integration for test script --- package.json | 2 +- scripts/accuracy/run-accuracy-tests.sh | 2 +- vitest.config.ts | 2 ++ 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/package.json b/package.json index 
8958a95f..bd545ff0 100644 --- a/package.json +++ b/package.json @@ -29,7 +29,7 @@ "check:types": "tsc --noEmit --project tsconfig.json", "reformat": "prettier --write .", "generate": "./scripts/generate.sh", - "test": "vitest --coverage", + "test": "vitest --project unit-and-integration --coverage", "pretest:accuracy": "npm run build:compile", "test:accuracy": "sh ./scripts/accuracy/run-accuracy-tests.sh" }, diff --git a/scripts/accuracy/run-accuracy-tests.sh b/scripts/accuracy/run-accuracy-tests.sh index 9b84edcb..c8607de8 100644 --- a/scripts/accuracy/run-accuracy-tests.sh +++ b/scripts/accuracy/run-accuracy-tests.sh @@ -17,7 +17,7 @@ export MDB_ACCURACY_RUN_ID=$(npx uuid v4) # specified in the command line. Such as: # npm run test:accuracy -- tests/accuracy/some-test.test.ts echo "Running accuracy tests with MDB_ACCURACY_RUN_ID '$MDB_ACCURACY_RUN_ID'" -vitest --config vitest.config.ts --project=accuracy --coverage=false --run --testTimeout=3600000 "$@" +vitest --config vitest.config.ts --project=accuracy --coverage=false --run "$@" # Preserving the exit code from test run to correctly notify in the CI # environments when the tests fail. 
diff --git a/vitest.config.ts b/vitest.config.ts index 7fa9d802..2a42cecb 100644 --- a/vitest.config.ts +++ b/vitest.config.ts @@ -22,6 +22,7 @@ export default defineConfig({ }, projects: [ { + extends: true, test: { name: "unit-and-integration", include: ["**/*.test.ts"], @@ -29,6 +30,7 @@ export default defineConfig({ }, }, { + extends: true, test: { name: "accuracy", include: ["**/accuracy/*.test.ts"], From 83157d31e1c0efcef093b1206172eb861df143a0 Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Wed, 16 Jul 2025 18:29:46 +0200 Subject: [PATCH 74/91] chore: PR feedback - well defined Model types - move getAvailableModels() inside the test setup --- tests/accuracy/aggregate.test.ts | 3 +-- tests/accuracy/collection-indexes.test.ts | 3 +-- tests/accuracy/collection-schema.test.ts | 3 +-- tests/accuracy/collection-storage-size.test.ts | 3 +-- tests/accuracy/count.test.ts | 3 +-- tests/accuracy/create-collection.test.ts | 3 +-- tests/accuracy/create-index.test.ts | 3 +-- tests/accuracy/db-stats.test.ts | 3 +-- tests/accuracy/delete-many.test.ts | 3 +-- tests/accuracy/drop-collection.test.ts | 3 +-- tests/accuracy/drop-database.test.ts | 3 +-- tests/accuracy/explain.test.ts | 3 +-- tests/accuracy/find.test.ts | 3 +-- tests/accuracy/insert-many.test.ts | 3 +-- tests/accuracy/list-collections.test.ts | 3 +-- tests/accuracy/list-databases.test.ts | 3 +-- tests/accuracy/logs.test.ts | 3 +-- tests/accuracy/rename-collection.test.ts | 3 +-- tests/accuracy/sdk/describe-accuracy-tests.ts | 5 +++-- tests/accuracy/sdk/models.ts | 10 ++++------ tests/accuracy/update-many.test.ts | 3 +-- 21 files changed, 26 insertions(+), 46 deletions(-) diff --git a/tests/accuracy/aggregate.test.ts b/tests/accuracy/aggregate.test.ts index 30a5a0e3..379459b2 100644 --- a/tests/accuracy/aggregate.test.ts +++ b/tests/accuracy/aggregate.test.ts @@ -1,7 +1,6 @@ import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; -import { getAvailableModels } from "./sdk/models.js"; 
-describeAccuracyTests(getAvailableModels(), [ +describeAccuracyTests([ { prompt: "Group all the movies in 'mflix.movies' namespace by 'release_year' and give me a count of them", expectedToolCalls: [ diff --git a/tests/accuracy/collection-indexes.test.ts b/tests/accuracy/collection-indexes.test.ts index dab7d317..0d4ca17a 100644 --- a/tests/accuracy/collection-indexes.test.ts +++ b/tests/accuracy/collection-indexes.test.ts @@ -1,5 +1,4 @@ import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; -import { getAvailableModels } from "./sdk/models.js"; import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; function callsCollectionIndexes(prompt: string): AccuracyTestConfig { @@ -17,7 +16,7 @@ function callsCollectionIndexes(prompt: string): AccuracyTestConfig { }; } -describeAccuracyTests(getAvailableModels(), [ +describeAccuracyTests([ callsCollectionIndexes("How many indexes do I have in 'mflix.movies' namespace?"), callsCollectionIndexes("List all the indexes in movies collection in mflix database"), callsCollectionIndexes( diff --git a/tests/accuracy/collection-schema.test.ts b/tests/accuracy/collection-schema.test.ts index f2f22a88..bf371f0b 100644 --- a/tests/accuracy/collection-schema.test.ts +++ b/tests/accuracy/collection-schema.test.ts @@ -1,5 +1,4 @@ import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; -import { getAvailableModels } from "./sdk/models.js"; import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; function callsCollectionSchema(prompt: string): AccuracyTestConfig { @@ -17,7 +16,7 @@ function callsCollectionSchema(prompt: string): AccuracyTestConfig { }; } -describeAccuracyTests(getAvailableModels(), [ +describeAccuracyTests([ callsCollectionSchema("Is there a title field in 'db1.coll1' namespace?"), callsCollectionSchema("What is the type of value stored in title field in coll1 collection in db1 database?"), ]); diff --git a/tests/accuracy/collection-storage-size.test.ts 
b/tests/accuracy/collection-storage-size.test.ts index 2bd2f021..387eccb4 100644 --- a/tests/accuracy/collection-storage-size.test.ts +++ b/tests/accuracy/collection-storage-size.test.ts @@ -1,7 +1,6 @@ import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; -import { getAvailableModels } from "./sdk/models.js"; -describeAccuracyTests(getAvailableModels(), [ +describeAccuracyTests([ { prompt: "What is the size of 'mflix.movies' namespace", expectedToolCalls: [ diff --git a/tests/accuracy/count.test.ts b/tests/accuracy/count.test.ts index 09db4678..24cd64fc 100644 --- a/tests/accuracy/count.test.ts +++ b/tests/accuracy/count.test.ts @@ -1,5 +1,4 @@ import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; -import { getAvailableModels } from "./sdk/models.js"; import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; function callsCountToolWithEmptyQuery(prompt: string, database = "mflix", collection = "movies"): AccuracyTestConfig { @@ -38,7 +37,7 @@ function callsCountToolWithQuery( }; } -describeAccuracyTests(getAvailableModels(), [ +describeAccuracyTests([ callsCountToolWithEmptyQuery("Count number of documents in 'mflix.movies' namespace."), callsCountToolWithEmptyQuery( "How many documents are there in 'characters' collection in 'comics' database?", diff --git a/tests/accuracy/create-collection.test.ts b/tests/accuracy/create-collection.test.ts index 89d6980d..684f353c 100644 --- a/tests/accuracy/create-collection.test.ts +++ b/tests/accuracy/create-collection.test.ts @@ -1,5 +1,4 @@ import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; -import { getAvailableModels } from "./sdk/models.js"; import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; import { ExpectedToolCall } from "./sdk/accuracy-result-storage/result-storage.js"; @@ -27,7 +26,7 @@ function callsCreateCollectionWithListCollections(prompt: string, expectedToolCa }; } -describeAccuracyTests(getAvailableModels(), [ 
+describeAccuracyTests([ callsCreateCollection("Create a new namespace 'mflix.documentaries'", "mflix", "documentaries"), callsCreateCollection("Create a new collection villains in comics database", "comics", "villains"), callsCreateCollectionWithListCollections( diff --git a/tests/accuracy/create-index.test.ts b/tests/accuracy/create-index.test.ts index 6dae12e5..7868fe22 100644 --- a/tests/accuracy/create-index.test.ts +++ b/tests/accuracy/create-index.test.ts @@ -1,5 +1,4 @@ import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; -import { getAvailableModels } from "./sdk/models.js"; import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; function callsCreateIndex(prompt: string, indexKeys: Record): AccuracyTestConfig { @@ -18,7 +17,7 @@ function callsCreateIndex(prompt: string, indexKeys: Record): A }; } -describeAccuracyTests(getAvailableModels(), [ +describeAccuracyTests([ callsCreateIndex( "Create an index that covers the following query on 'mflix.movies' namespace - { \"release_year\": 1992 }", { diff --git a/tests/accuracy/db-stats.test.ts b/tests/accuracy/db-stats.test.ts index 656eccc2..5c599e24 100644 --- a/tests/accuracy/db-stats.test.ts +++ b/tests/accuracy/db-stats.test.ts @@ -1,5 +1,4 @@ import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; -import { getAvailableModels } from "./sdk/models.js"; import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; function callsListDatabases(prompt: string, database = "mflix"): AccuracyTestConfig { @@ -16,4 +15,4 @@ function callsListDatabases(prompt: string, database = "mflix"): AccuracyTestCon }; } -describeAccuracyTests(getAvailableModels(), [callsListDatabases("What is the size occupied by database mflix?")]); +describeAccuracyTests([callsListDatabases("What is the size occupied by database mflix?")]); diff --git a/tests/accuracy/delete-many.test.ts b/tests/accuracy/delete-many.test.ts index c0dd4d51..21296a07 100644 --- 
a/tests/accuracy/delete-many.test.ts +++ b/tests/accuracy/delete-many.test.ts @@ -1,5 +1,4 @@ import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; -import { getAvailableModels } from "./sdk/models.js"; import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; function callsDeleteManyWithEmptyFilters(prompt: string): AccuracyTestConfig { @@ -33,7 +32,7 @@ function callsDeleteManyWithFilters(prompt: string): AccuracyTestConfig { }; } -describeAccuracyTests(getAvailableModels(), [ +describeAccuracyTests([ callsDeleteManyWithEmptyFilters("Delete all the documents from 'mflix.movies' namespace"), callsDeleteManyWithEmptyFilters("Purge the collection 'movies' in database 'mflix'"), callsDeleteManyWithFilters("Remove all the documents from namespace 'mflix.movies' where runtime is less than 100"), diff --git a/tests/accuracy/drop-collection.test.ts b/tests/accuracy/drop-collection.test.ts index a9f2494c..5c96defe 100644 --- a/tests/accuracy/drop-collection.test.ts +++ b/tests/accuracy/drop-collection.test.ts @@ -1,5 +1,4 @@ import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; -import { getAvailableModels } from "./sdk/models.js"; import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; import { ExpectedToolCall } from "./sdk/accuracy-result-storage/result-storage.js"; @@ -25,7 +24,7 @@ function callsDropCollection(prompt: string, expectedToolCalls: ExpectedToolCall }; } -describeAccuracyTests(getAvailableModels(), [ +describeAccuracyTests([ onlyCallsDropCollection("Remove mflix.movies namespace from my cluster."), onlyCallsDropCollection("Drop movies collection from mflix database."), callsDropCollection("Remove books collection from which ever database contains it.", [ diff --git a/tests/accuracy/drop-database.test.ts b/tests/accuracy/drop-database.test.ts index 74876658..c7690a77 100644 --- a/tests/accuracy/drop-database.test.ts +++ b/tests/accuracy/drop-database.test.ts @@ -1,5 +1,4 @@ import { 
describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; -import { getAvailableModels } from "./sdk/models.js"; import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; import { ExpectedToolCall } from "./sdk/accuracy-result-storage/result-storage.js"; @@ -24,7 +23,7 @@ function callsDropDatabase(prompt: string, expectedToolCalls: ExpectedToolCall[] }; } -describeAccuracyTests(getAvailableModels(), [ +describeAccuracyTests([ onlyCallsDropDatabase("Remove mflix database from my cluster."), onlyCallsDropDatabase("Drop database named mflix."), callsDropDatabase("If there is a mflix database in my cluster then drop it.", [ diff --git a/tests/accuracy/explain.test.ts b/tests/accuracy/explain.test.ts index 4a539c48..4b58046c 100644 --- a/tests/accuracy/explain.test.ts +++ b/tests/accuracy/explain.test.ts @@ -1,5 +1,4 @@ import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; -import { getAvailableModels } from "./sdk/models.js"; import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; function callsExplain(prompt: string, method: Record): AccuracyTestConfig { @@ -51,7 +50,7 @@ const callsExplainWithCount = (prompt: string) => * because we are using Zod.union, when we probably should've used * Zod.discriminatedUnion */ -describeAccuracyTests(getAvailableModels(), [ +describeAccuracyTests([ callsExplainWithFind( `Will fetching documents, where release_year is 2020, from 'mflix.movies' namespace perform a collection scan?` ), diff --git a/tests/accuracy/find.test.ts b/tests/accuracy/find.test.ts index 02c02cd1..125afb67 100644 --- a/tests/accuracy/find.test.ts +++ b/tests/accuracy/find.test.ts @@ -1,5 +1,4 @@ import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; -import { getAvailableModels } from "./sdk/models.js"; import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; function callsFindNoFilter(prompt: string, database = "mflix", collection = "movies"): AccuracyTestConfig { @@ -93,7 
+92,7 @@ function callsFindWithFilterSortAndLimit( }; } -describeAccuracyTests(getAvailableModels(), [ +describeAccuracyTests([ callsFindNoFilter("List all the movies in 'mflix.movies' namespace."), callsFindNoFilter("List all the documents in 'comics.books' namespace.", "comics", "books"), callsFindWithFilter("Find all the movies in 'mflix.movies' namespace with runtime less than 100.", { diff --git a/tests/accuracy/insert-many.test.ts b/tests/accuracy/insert-many.test.ts index 4ce15bb8..c7dc238f 100644 --- a/tests/accuracy/insert-many.test.ts +++ b/tests/accuracy/insert-many.test.ts @@ -1,5 +1,4 @@ import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; -import { getAvailableModels } from "./sdk/models.js"; import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; function callsInsertMany(prompt: string): AccuracyTestConfig { @@ -47,7 +46,7 @@ function callsEmptyInsertMany(prompt: string) { }; } -describeAccuracyTests(getAvailableModels(), [ +describeAccuracyTests([ callsInsertMany( [ "In my namespace 'mflix.movies', insert 3 documents each with the following fields:", diff --git a/tests/accuracy/list-collections.test.ts b/tests/accuracy/list-collections.test.ts index 78a14f34..4e8a7f0a 100644 --- a/tests/accuracy/list-collections.test.ts +++ b/tests/accuracy/list-collections.test.ts @@ -1,5 +1,4 @@ import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; -import { getAvailableModels } from "./sdk/models.js"; import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; function callsListCollections(prompt: string): AccuracyTestConfig { @@ -48,7 +47,7 @@ function callsListDatabasesAndListCollections(prompt: string): AccuracyTestConfi }; } -describeAccuracyTests(getAvailableModels(), [ +describeAccuracyTests([ callsListCollections("How many collections do I have in database mflix?"), callsListCollections("List all the collections in my MongoDB database mflix."), callsListCollections("Is there a shows 
collection in my MongoDB database mflix?"), diff --git a/tests/accuracy/list-databases.test.ts b/tests/accuracy/list-databases.test.ts index 97a8ce27..935a0c0b 100644 --- a/tests/accuracy/list-databases.test.ts +++ b/tests/accuracy/list-databases.test.ts @@ -1,5 +1,4 @@ import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; -import { getAvailableModels } from "./sdk/models.js"; import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; function callsListDatabases(prompt: string): AccuracyTestConfig { @@ -14,7 +13,7 @@ function callsListDatabases(prompt: string): AccuracyTestConfig { }; } -describeAccuracyTests(getAvailableModels(), [ +describeAccuracyTests([ callsListDatabases("How many databases do I have?"), callsListDatabases("List all the databases that I have in my clusters"), callsListDatabases("Is there a mflix database in my cluster?"), diff --git a/tests/accuracy/logs.test.ts b/tests/accuracy/logs.test.ts index 2dbe8d09..1cf3c24e 100644 --- a/tests/accuracy/logs.test.ts +++ b/tests/accuracy/logs.test.ts @@ -1,5 +1,4 @@ import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; -import { getAvailableModels } from "./sdk/models.js"; import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; import { ExpectedToolCall } from "./sdk/accuracy-result-storage/result-storage.js"; @@ -10,7 +9,7 @@ function callsLogsTool(prompt: string, toolCall: ExpectedToolCall): AccuracyTest }; } -describeAccuracyTests(getAvailableModels(), [ +describeAccuracyTests([ callsLogsTool("Were there any startup warnings for my MongoDB server?", { toolName: "mongodb-logs", parameters: { diff --git a/tests/accuracy/rename-collection.test.ts b/tests/accuracy/rename-collection.test.ts index 549a02b9..b4ba88fc 100644 --- a/tests/accuracy/rename-collection.test.ts +++ b/tests/accuracy/rename-collection.test.ts @@ -1,5 +1,4 @@ import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; -import { getAvailableModels } from 
"./sdk/models.js"; import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; function callsRenameCollection(prompt: string): AccuracyTestConfig { @@ -35,7 +34,7 @@ function callsRenameCollectionWithDropTarget(prompt: string): AccuracyTestConfig }; } -describeAccuracyTests(getAvailableModels(), [ +describeAccuracyTests([ callsRenameCollection("Rename my 'mflix.movies' namespace to 'mflix.new_movies'"), callsRenameCollectionWithDropTarget( "Rename my 'mflix.movies' namespace to 'mflix.new_movies' while removing the old namespace." diff --git a/tests/accuracy/sdk/describe-accuracy-tests.ts b/tests/accuracy/sdk/describe-accuracy-tests.ts index bcd58a69..4f3c9633 100644 --- a/tests/accuracy/sdk/describe-accuracy-tests.ts +++ b/tests/accuracy/sdk/describe-accuracy-tests.ts @@ -1,5 +1,5 @@ import { describe, it, beforeAll, beforeEach, afterAll } from "vitest"; -import { TestableModels } from "./models.js"; +import { getAvailableModels } from "./models.js"; import { calculateToolCallingAccuracy } from "./accuracy-scorer.js"; import { getVercelToolCallingAgent, VercelAgent } from "./agent.js"; import { prepareTestData, setupMongoDBIntegrationTest } from "../../integration/tools/mongodb/mongodbHelpers.js"; @@ -41,11 +41,12 @@ export interface AccuracyTestConfig { mockedTools?: MockedTools; } -export function describeAccuracyTests(models: TestableModels, accuracyTestConfigs: AccuracyTestConfig[]) { +export function describeAccuracyTests(accuracyTestConfigs: AccuracyTestConfig[]) { if (!process.env.MDB_ACCURACY_RUN_ID) { throw new Error("MDB_ACCURACY_RUN_ID env variable is required for accuracy test runs!"); } + const models = getAvailableModels(); if (!models.length) { throw new Error("No models available to test. 
Ensure that the API keys are properly setup!"); } diff --git a/tests/accuracy/sdk/models.ts b/tests/accuracy/sdk/models.ts index 928b7793..02d2739b 100644 --- a/tests/accuracy/sdk/models.ts +++ b/tests/accuracy/sdk/models.ts @@ -4,12 +4,12 @@ import { createAzure } from "@ai-sdk/azure"; import { createOpenAI } from "@ai-sdk/openai"; import { ollama } from "ollama-ai-provider"; -export interface Model

{ +export interface Model { readonly modelName: string; readonly provider: string; readonly displayName: string; isAvailable(): boolean; - getModel(): P; + getModel(): VercelModel; } export class OpenAIModel implements Model { @@ -88,10 +88,8 @@ export class OllamaModel implements Model { } } -const ALL_TESTABLE_MODELS = [new AzureOpenAIModel("gpt-4o")]; +const ALL_TESTABLE_MODELS: Model[] = [new AzureOpenAIModel("gpt-4o")]; -export type TestableModels = ReturnType; - -export function getAvailableModels() { +export function getAvailableModels(): Model[] { return ALL_TESTABLE_MODELS.filter((model) => model.isAvailable()); } diff --git a/tests/accuracy/update-many.test.ts b/tests/accuracy/update-many.test.ts index 86f96705..0975a2b9 100644 --- a/tests/accuracy/update-many.test.ts +++ b/tests/accuracy/update-many.test.ts @@ -1,5 +1,4 @@ import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; -import { getAvailableModels } from "./sdk/models.js"; import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; function callsUpdateManyWithEmptyFilters(prompt: string): AccuracyTestConfig { @@ -43,7 +42,7 @@ function callsUpdateManyWithFilters(prompt: string, filter: Record Date: Thu, 17 Jul 2025 10:30:53 +0200 Subject: [PATCH 75/91] chore: add return type annotation for accuracy testing client --- tests/accuracy/sdk/accuracy-testing-client.ts | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/tests/accuracy/sdk/accuracy-testing-client.ts b/tests/accuracy/sdk/accuracy-testing-client.ts index 25a224a1..e9f83877 100644 --- a/tests/accuracy/sdk/accuracy-testing-client.ts +++ b/tests/accuracy/sdk/accuracy-testing-client.ts @@ -5,6 +5,7 @@ import { StdioClientTransport } from "@modelcontextprotocol/sdk/client/stdio.js" import { MCP_SERVER_CLI_SCRIPT } from "./constants.js"; import { LLMToolCall } from "./accuracy-result-storage/result-storage.js"; +import { VercelMCPClient, VercelMCPClientTools } from "./agent.js"; type 
ToolResultGeneratorFn = (...parameters: unknown[]) => CallToolResult | Promise; export type MockedTools = Record; @@ -22,15 +23,15 @@ export class AccuracyTestingClient { private mockedTools: MockedTools = {}; private llmToolCalls: LLMToolCall[] = []; - private constructor(private readonly vercelMCPClient: Awaited>) {} + private constructor(private readonly vercelMCPClient: VercelMCPClient) {} - async close() { + async close(): Promise { await this.vercelMCPClient?.close(); } - async vercelTools() { + async vercelTools(): Promise { const vercelTools = (await this.vercelMCPClient?.tools()) ?? {}; - const rewrappedVercelTools: typeof vercelTools = {}; + const rewrappedVercelTools: VercelMCPClientTools = {}; for (const [toolName, tool] of Object.entries(vercelTools)) { rewrappedVercelTools[toolName] = createVercelTool({ ...tool, @@ -65,20 +66,20 @@ export class AccuracyTestingClient { return rewrappedVercelTools; } - getLLMToolCalls() { + getLLMToolCalls(): LLMToolCall[] { return this.llmToolCalls; } - mockTools(mockedTools: MockedTools) { + mockTools(mockedTools: MockedTools): void { this.mockedTools = mockedTools; } - resetForTests() { + resetForTests(): void { this.mockTools({}); this.llmToolCalls = []; } - static async initializeClient(mdbConnectionString: string) { + static async initializeClient(mdbConnectionString: string): Promise { const clientTransport = new StdioClientTransport({ command: process.execPath, args: [MCP_SERVER_CLI_SCRIPT, "--connectionString", mdbConnectionString], From ba37196b4ca62e18150e875a49fb07c71c028cee Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Thu, 17 Jul 2025 10:46:44 +0200 Subject: [PATCH 76/91] chore: update test file names per naming convention --- .../{collection-indexes.test.ts => collectionIndexes.test.ts} | 0 .../{collection-schema.test.ts => collectionSchema.test.ts} | 0 ...lection-storage-size.test.ts => collectionStorageSize.test.ts} | 0 .../{create-collection.test.ts => createCollection.test.ts} | 0 
tests/accuracy/{create-index.test.ts => createIndex.test.ts} | 0 tests/accuracy/{db-stats.test.ts => dbStats.test.ts} | 0 tests/accuracy/{delete-many.test.ts => deleteMany.test.ts} | 0 .../accuracy/{drop-collection.test.ts => dropCollection.test.ts} | 0 tests/accuracy/{drop-database.test.ts => dropDatabase.test.ts} | 0 tests/accuracy/{insert-many.test.ts => insertMany.test.ts} | 0 .../{list-collections.test.ts => listCollections.test.ts} | 0 tests/accuracy/{list-databases.test.ts => listDatabases.test.ts} | 0 .../{rename-collection.test.ts => renameCollection.test.ts} | 0 tests/accuracy/{update-many.test.ts => updateMany.test.ts} | 0 14 files changed, 0 insertions(+), 0 deletions(-) rename tests/accuracy/{collection-indexes.test.ts => collectionIndexes.test.ts} (100%) rename tests/accuracy/{collection-schema.test.ts => collectionSchema.test.ts} (100%) rename tests/accuracy/{collection-storage-size.test.ts => collectionStorageSize.test.ts} (100%) rename tests/accuracy/{create-collection.test.ts => createCollection.test.ts} (100%) rename tests/accuracy/{create-index.test.ts => createIndex.test.ts} (100%) rename tests/accuracy/{db-stats.test.ts => dbStats.test.ts} (100%) rename tests/accuracy/{delete-many.test.ts => deleteMany.test.ts} (100%) rename tests/accuracy/{drop-collection.test.ts => dropCollection.test.ts} (100%) rename tests/accuracy/{drop-database.test.ts => dropDatabase.test.ts} (100%) rename tests/accuracy/{insert-many.test.ts => insertMany.test.ts} (100%) rename tests/accuracy/{list-collections.test.ts => listCollections.test.ts} (100%) rename tests/accuracy/{list-databases.test.ts => listDatabases.test.ts} (100%) rename tests/accuracy/{rename-collection.test.ts => renameCollection.test.ts} (100%) rename tests/accuracy/{update-many.test.ts => updateMany.test.ts} (100%) diff --git a/tests/accuracy/collection-indexes.test.ts b/tests/accuracy/collectionIndexes.test.ts similarity index 100% rename from tests/accuracy/collection-indexes.test.ts rename to 
tests/accuracy/collectionIndexes.test.ts diff --git a/tests/accuracy/collection-schema.test.ts b/tests/accuracy/collectionSchema.test.ts similarity index 100% rename from tests/accuracy/collection-schema.test.ts rename to tests/accuracy/collectionSchema.test.ts diff --git a/tests/accuracy/collection-storage-size.test.ts b/tests/accuracy/collectionStorageSize.test.ts similarity index 100% rename from tests/accuracy/collection-storage-size.test.ts rename to tests/accuracy/collectionStorageSize.test.ts diff --git a/tests/accuracy/create-collection.test.ts b/tests/accuracy/createCollection.test.ts similarity index 100% rename from tests/accuracy/create-collection.test.ts rename to tests/accuracy/createCollection.test.ts diff --git a/tests/accuracy/create-index.test.ts b/tests/accuracy/createIndex.test.ts similarity index 100% rename from tests/accuracy/create-index.test.ts rename to tests/accuracy/createIndex.test.ts diff --git a/tests/accuracy/db-stats.test.ts b/tests/accuracy/dbStats.test.ts similarity index 100% rename from tests/accuracy/db-stats.test.ts rename to tests/accuracy/dbStats.test.ts diff --git a/tests/accuracy/delete-many.test.ts b/tests/accuracy/deleteMany.test.ts similarity index 100% rename from tests/accuracy/delete-many.test.ts rename to tests/accuracy/deleteMany.test.ts diff --git a/tests/accuracy/drop-collection.test.ts b/tests/accuracy/dropCollection.test.ts similarity index 100% rename from tests/accuracy/drop-collection.test.ts rename to tests/accuracy/dropCollection.test.ts diff --git a/tests/accuracy/drop-database.test.ts b/tests/accuracy/dropDatabase.test.ts similarity index 100% rename from tests/accuracy/drop-database.test.ts rename to tests/accuracy/dropDatabase.test.ts diff --git a/tests/accuracy/insert-many.test.ts b/tests/accuracy/insertMany.test.ts similarity index 100% rename from tests/accuracy/insert-many.test.ts rename to tests/accuracy/insertMany.test.ts diff --git a/tests/accuracy/list-collections.test.ts 
b/tests/accuracy/listCollections.test.ts similarity index 100% rename from tests/accuracy/list-collections.test.ts rename to tests/accuracy/listCollections.test.ts diff --git a/tests/accuracy/list-databases.test.ts b/tests/accuracy/listDatabases.test.ts similarity index 100% rename from tests/accuracy/list-databases.test.ts rename to tests/accuracy/listDatabases.test.ts diff --git a/tests/accuracy/rename-collection.test.ts b/tests/accuracy/renameCollection.test.ts similarity index 100% rename from tests/accuracy/rename-collection.test.ts rename to tests/accuracy/renameCollection.test.ts diff --git a/tests/accuracy/update-many.test.ts b/tests/accuracy/updateMany.test.ts similarity index 100% rename from tests/accuracy/update-many.test.ts rename to tests/accuracy/updateMany.test.ts From c2a51fd456b86619589528187db63ac8fdbc9fc6 Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Thu, 17 Jul 2025 10:54:17 +0200 Subject: [PATCH 77/91] chore: update sdk file names per naming convention --- scripts/accuracy/generate-test-summary.ts | 6 +++--- scripts/accuracy/update-accuracy-run-status.ts | 6 +++--- tests/accuracy/aggregate.test.ts | 2 +- tests/accuracy/collectionIndexes.test.ts | 4 ++-- tests/accuracy/collectionSchema.test.ts | 4 ++-- tests/accuracy/collectionStorageSize.test.ts | 2 +- tests/accuracy/count.test.ts | 4 ++-- tests/accuracy/createCollection.test.ts | 6 +++--- tests/accuracy/createIndex.test.ts | 4 ++-- tests/accuracy/dbStats.test.ts | 4 ++-- tests/accuracy/deleteMany.test.ts | 4 ++-- tests/accuracy/dropCollection.test.ts | 6 +++--- tests/accuracy/dropDatabase.test.ts | 6 +++--- tests/accuracy/explain.test.ts | 4 ++-- tests/accuracy/find.test.ts | 4 ++-- tests/accuracy/insertMany.test.ts | 4 ++-- tests/accuracy/listCollections.test.ts | 4 ++-- tests/accuracy/listDatabases.test.ts | 4 ++-- tests/accuracy/logs.test.ts | 6 +++--- tests/accuracy/renameCollection.test.ts | 4 ++-- .../diskStorage.ts} | 2 +- .../getAccuracyResultStorage.ts} | 6 +++--- 
.../mongodbStorage.ts} | 2 +- .../resultStorage.ts} | 0 .../sdk/{accuracy-scorer.ts => accuracyScorer.ts} | 2 +- ...racy-testing-client.ts => accuracyTestingClient.ts} | 2 +- ...ribe-accuracy-tests.ts => describeAccuracyTests.ts} | 10 +++++----- tests/accuracy/sdk/{git-info.ts => gitInfo.ts} | 5 ----- tests/accuracy/updateMany.test.ts | 4 ++-- ...{accuracy-scorer.test.ts => accuracyScorer.test.ts} | 4 ++-- 30 files changed, 60 insertions(+), 65 deletions(-) rename tests/accuracy/sdk/{accuracy-result-storage/disk-storage.ts => accuracyResultStorage/diskStorage.ts} (99%) rename tests/accuracy/sdk/{accuracy-result-storage/get-accuracy-result-storage.ts => accuracyResultStorage/getAccuracyResultStorage.ts} (69%) rename tests/accuracy/sdk/{accuracy-result-storage/mongodb-storage.ts => accuracyResultStorage/mongodbStorage.ts} (99%) rename tests/accuracy/sdk/{accuracy-result-storage/result-storage.ts => accuracyResultStorage/resultStorage.ts} (100%) rename tests/accuracy/sdk/{accuracy-scorer.ts => accuracyScorer.ts} (97%) rename tests/accuracy/sdk/{accuracy-testing-client.ts => accuracyTestingClient.ts} (98%) rename tests/accuracy/sdk/{describe-accuracy-tests.ts => describeAccuracyTests.ts} (93%) rename tests/accuracy/sdk/{git-info.ts => gitInfo.ts} (50%) rename tests/unit/{accuracy-scorer.test.ts => accuracyScorer.test.ts} (99%) diff --git a/scripts/accuracy/generate-test-summary.ts b/scripts/accuracy/generate-test-summary.ts index 2c73cc7c..3f5783d7 100644 --- a/scripts/accuracy/generate-test-summary.ts +++ b/scripts/accuracy/generate-test-summary.ts @@ -1,14 +1,14 @@ import path from "path"; import { readFile, writeFile, mkdir } from "fs/promises"; -import { getAccuracyResultStorage } from "../../tests/accuracy/sdk/accuracy-result-storage/get-accuracy-result-storage.js"; +import { getAccuracyResultStorage } from "../../tests/accuracy/sdk/accuracy-result-storage/getAccuracyResultStorage.js"; import { AccuracyResult, AccuracyRunStatuses, ExpectedToolCall, LLMToolCall, 
ModelResponse, -} from "../../tests/accuracy/sdk/accuracy-result-storage/result-storage.js"; -import { getCommitSHA } from "../../tests/accuracy/sdk/git-info.js"; +} from "../../tests/accuracy/sdk/accuracy-result-storage/resultStorage.js"; +import { getCommitSHA } from "../../tests/accuracy/sdk/gitInfo.js"; import { HTML_TEST_SUMMARY_FILE, HTML_TESTS_SUMMARY_TEMPLATE, diff --git a/scripts/accuracy/update-accuracy-run-status.ts b/scripts/accuracy/update-accuracy-run-status.ts index 344ed86d..ae5544be 100644 --- a/scripts/accuracy/update-accuracy-run-status.ts +++ b/scripts/accuracy/update-accuracy-run-status.ts @@ -1,6 +1,6 @@ -import { getAccuracyResultStorage } from "../../tests/accuracy/sdk/accuracy-result-storage/get-accuracy-result-storage.js"; -import { AccuracyRunStatus } from "../../tests/accuracy/sdk/accuracy-result-storage/result-storage.js"; -import { getCommitSHA } from "../../tests/accuracy/sdk/git-info.js"; +import { getAccuracyResultStorage } from "../../tests/accuracy/sdk/accuracy-result-storage/getAccuracyResultStorage.js"; +import { AccuracyRunStatus } from "../../tests/accuracy/sdk/accuracy-result-storage/resultStorage.js"; +import { getCommitSHA } from "../../tests/accuracy/sdk/gitInfo.js"; const envAccuracyRunId = process.env.MDB_ACCURACY_RUN_ID; const envAccuracyRunStatus = process.env.MDB_ACCURACY_RUN_STATUS; diff --git a/tests/accuracy/aggregate.test.ts b/tests/accuracy/aggregate.test.ts index 379459b2..9e8ad13c 100644 --- a/tests/accuracy/aggregate.test.ts +++ b/tests/accuracy/aggregate.test.ts @@ -1,4 +1,4 @@ -import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; +import { describeAccuracyTests } from "./sdk/describeAccuracyTests.js"; describeAccuracyTests([ { diff --git a/tests/accuracy/collectionIndexes.test.ts b/tests/accuracy/collectionIndexes.test.ts index 0d4ca17a..de8306a6 100644 --- a/tests/accuracy/collectionIndexes.test.ts +++ b/tests/accuracy/collectionIndexes.test.ts @@ -1,5 +1,5 @@ -import { 
describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; -import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; +import { describeAccuracyTests } from "./sdk/describeAccuracyTests.js"; +import { AccuracyTestConfig } from "./sdk/describeAccuracyTests.js"; function callsCollectionIndexes(prompt: string): AccuracyTestConfig { return { diff --git a/tests/accuracy/collectionSchema.test.ts b/tests/accuracy/collectionSchema.test.ts index bf371f0b..36890d76 100644 --- a/tests/accuracy/collectionSchema.test.ts +++ b/tests/accuracy/collectionSchema.test.ts @@ -1,5 +1,5 @@ -import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; -import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; +import { describeAccuracyTests } from "./sdk/describeAccuracyTests.js"; +import { AccuracyTestConfig } from "./sdk/describeAccuracyTests.js"; function callsCollectionSchema(prompt: string): AccuracyTestConfig { return { diff --git a/tests/accuracy/collectionStorageSize.test.ts b/tests/accuracy/collectionStorageSize.test.ts index 387eccb4..8180341e 100644 --- a/tests/accuracy/collectionStorageSize.test.ts +++ b/tests/accuracy/collectionStorageSize.test.ts @@ -1,4 +1,4 @@ -import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; +import { describeAccuracyTests } from "./sdk/describeAccuracyTests.js"; describeAccuracyTests([ { diff --git a/tests/accuracy/count.test.ts b/tests/accuracy/count.test.ts index 24cd64fc..5fa9a473 100644 --- a/tests/accuracy/count.test.ts +++ b/tests/accuracy/count.test.ts @@ -1,5 +1,5 @@ -import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; -import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; +import { describeAccuracyTests } from "./sdk/describeAccuracyTests.js"; +import { AccuracyTestConfig } from "./sdk/describeAccuracyTests.js"; function callsCountToolWithEmptyQuery(prompt: string, database = "mflix", collection = "movies"): AccuracyTestConfig { return { 
diff --git a/tests/accuracy/createCollection.test.ts b/tests/accuracy/createCollection.test.ts index 684f353c..6efb1606 100644 --- a/tests/accuracy/createCollection.test.ts +++ b/tests/accuracy/createCollection.test.ts @@ -1,6 +1,6 @@ -import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; -import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; -import { ExpectedToolCall } from "./sdk/accuracy-result-storage/result-storage.js"; +import { describeAccuracyTests } from "./sdk/describeAccuracyTests.js"; +import { AccuracyTestConfig } from "./sdk/describeAccuracyTests.js"; +import { ExpectedToolCall } from "./sdk/accuracyResultStorage/resultStorage.js"; function callsCreateCollection(prompt: string, database: string, collection: string): AccuracyTestConfig { return { diff --git a/tests/accuracy/createIndex.test.ts b/tests/accuracy/createIndex.test.ts index 7868fe22..79d226f6 100644 --- a/tests/accuracy/createIndex.test.ts +++ b/tests/accuracy/createIndex.test.ts @@ -1,5 +1,5 @@ -import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; -import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; +import { describeAccuracyTests } from "./sdk/describeAccuracyTests.js"; +import { AccuracyTestConfig } from "./sdk/describeAccuracyTests.js"; function callsCreateIndex(prompt: string, indexKeys: Record): AccuracyTestConfig { return { diff --git a/tests/accuracy/dbStats.test.ts b/tests/accuracy/dbStats.test.ts index 5c599e24..86abb6d8 100644 --- a/tests/accuracy/dbStats.test.ts +++ b/tests/accuracy/dbStats.test.ts @@ -1,5 +1,5 @@ -import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; -import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; +import { describeAccuracyTests } from "./sdk/describeAccuracyTests.js"; +import { AccuracyTestConfig } from "./sdk/describeAccuracyTests.js"; function callsListDatabases(prompt: string, database = "mflix"): AccuracyTestConfig { return { diff --git 
a/tests/accuracy/deleteMany.test.ts b/tests/accuracy/deleteMany.test.ts index 21296a07..61b94669 100644 --- a/tests/accuracy/deleteMany.test.ts +++ b/tests/accuracy/deleteMany.test.ts @@ -1,5 +1,5 @@ -import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; -import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; +import { describeAccuracyTests } from "./sdk/describeAccuracyTests.js"; +import { AccuracyTestConfig } from "./sdk/describeAccuracyTests.js"; function callsDeleteManyWithEmptyFilters(prompt: string): AccuracyTestConfig { return { diff --git a/tests/accuracy/dropCollection.test.ts b/tests/accuracy/dropCollection.test.ts index 5c96defe..3f7dd3dd 100644 --- a/tests/accuracy/dropCollection.test.ts +++ b/tests/accuracy/dropCollection.test.ts @@ -1,6 +1,6 @@ -import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; -import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; -import { ExpectedToolCall } from "./sdk/accuracy-result-storage/result-storage.js"; +import { describeAccuracyTests } from "./sdk/describeAccuracyTests.js"; +import { AccuracyTestConfig } from "./sdk/describeAccuracyTests.js"; +import { ExpectedToolCall } from "./sdk/accuracyResultStorage/resultStorage.js"; function onlyCallsDropCollection(prompt: string): AccuracyTestConfig { return { diff --git a/tests/accuracy/dropDatabase.test.ts b/tests/accuracy/dropDatabase.test.ts index c7690a77..d9fb1757 100644 --- a/tests/accuracy/dropDatabase.test.ts +++ b/tests/accuracy/dropDatabase.test.ts @@ -1,6 +1,6 @@ -import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; -import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; -import { ExpectedToolCall } from "./sdk/accuracy-result-storage/result-storage.js"; +import { describeAccuracyTests } from "./sdk/describeAccuracyTests.js"; +import { AccuracyTestConfig } from "./sdk/describeAccuracyTests.js"; +import { ExpectedToolCall } from 
"./sdk/accuracyResultStorage/resultStorage.js"; function onlyCallsDropDatabase(prompt: string): AccuracyTestConfig { return { diff --git a/tests/accuracy/explain.test.ts b/tests/accuracy/explain.test.ts index 4b58046c..3189b493 100644 --- a/tests/accuracy/explain.test.ts +++ b/tests/accuracy/explain.test.ts @@ -1,5 +1,5 @@ -import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; -import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; +import { describeAccuracyTests } from "./sdk/describeAccuracyTests.js"; +import { AccuracyTestConfig } from "./sdk/describeAccuracyTests.js"; function callsExplain(prompt: string, method: Record): AccuracyTestConfig { return { diff --git a/tests/accuracy/find.test.ts b/tests/accuracy/find.test.ts index 125afb67..ea6eadd0 100644 --- a/tests/accuracy/find.test.ts +++ b/tests/accuracy/find.test.ts @@ -1,5 +1,5 @@ -import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; -import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; +import { describeAccuracyTests } from "./sdk/describeAccuracyTests.js"; +import { AccuracyTestConfig } from "./sdk/describeAccuracyTests.js"; function callsFindNoFilter(prompt: string, database = "mflix", collection = "movies"): AccuracyTestConfig { return { diff --git a/tests/accuracy/insertMany.test.ts b/tests/accuracy/insertMany.test.ts index c7dc238f..3116bc62 100644 --- a/tests/accuracy/insertMany.test.ts +++ b/tests/accuracy/insertMany.test.ts @@ -1,5 +1,5 @@ -import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; -import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; +import { describeAccuracyTests } from "./sdk/describeAccuracyTests.js"; +import { AccuracyTestConfig } from "./sdk/describeAccuracyTests.js"; function callsInsertMany(prompt: string): AccuracyTestConfig { return { diff --git a/tests/accuracy/listCollections.test.ts b/tests/accuracy/listCollections.test.ts index 4e8a7f0a..829e8712 100644 --- 
a/tests/accuracy/listCollections.test.ts +++ b/tests/accuracy/listCollections.test.ts @@ -1,5 +1,5 @@ -import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; -import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; +import { describeAccuracyTests } from "./sdk/describeAccuracyTests.js"; +import { AccuracyTestConfig } from "./sdk/describeAccuracyTests.js"; function callsListCollections(prompt: string): AccuracyTestConfig { return { diff --git a/tests/accuracy/listDatabases.test.ts b/tests/accuracy/listDatabases.test.ts index 935a0c0b..8944a83d 100644 --- a/tests/accuracy/listDatabases.test.ts +++ b/tests/accuracy/listDatabases.test.ts @@ -1,5 +1,5 @@ -import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; -import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; +import { describeAccuracyTests } from "./sdk/describeAccuracyTests.js"; +import { AccuracyTestConfig } from "./sdk/describeAccuracyTests.js"; function callsListDatabases(prompt: string): AccuracyTestConfig { return { diff --git a/tests/accuracy/logs.test.ts b/tests/accuracy/logs.test.ts index 1cf3c24e..f3e1f242 100644 --- a/tests/accuracy/logs.test.ts +++ b/tests/accuracy/logs.test.ts @@ -1,6 +1,6 @@ -import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; -import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; -import { ExpectedToolCall } from "./sdk/accuracy-result-storage/result-storage.js"; +import { describeAccuracyTests } from "./sdk/describeAccuracyTests.js"; +import { AccuracyTestConfig } from "./sdk/describeAccuracyTests.js"; +import { ExpectedToolCall } from "./sdk/accuracyResultStorage/resultStorage.js"; function callsLogsTool(prompt: string, toolCall: ExpectedToolCall): AccuracyTestConfig { return { diff --git a/tests/accuracy/renameCollection.test.ts b/tests/accuracy/renameCollection.test.ts index b4ba88fc..ba160f20 100644 --- a/tests/accuracy/renameCollection.test.ts +++ 
b/tests/accuracy/renameCollection.test.ts @@ -1,5 +1,5 @@ -import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; -import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; +import { describeAccuracyTests } from "./sdk/describeAccuracyTests.js"; +import { AccuracyTestConfig } from "./sdk/describeAccuracyTests.js"; function callsRenameCollection(prompt: string): AccuracyTestConfig { return { diff --git a/tests/accuracy/sdk/accuracy-result-storage/disk-storage.ts b/tests/accuracy/sdk/accuracyResultStorage/diskStorage.ts similarity index 99% rename from tests/accuracy/sdk/accuracy-result-storage/disk-storage.ts rename to tests/accuracy/sdk/accuracyResultStorage/diskStorage.ts index 51be8c7f..03aba702 100644 --- a/tests/accuracy/sdk/accuracy-result-storage/disk-storage.ts +++ b/tests/accuracy/sdk/accuracyResultStorage/diskStorage.ts @@ -9,7 +9,7 @@ import { AccuracyRunStatuses, ExpectedToolCall, ModelResponse, -} from "./result-storage.js"; +} from "./resultStorage.js"; export class DiskBasedResultStorage implements AccuracyResultStorage { async getAccuracyResult(commitSHA: string, runId?: string): Promise { diff --git a/tests/accuracy/sdk/accuracy-result-storage/get-accuracy-result-storage.ts b/tests/accuracy/sdk/accuracyResultStorage/getAccuracyResultStorage.ts similarity index 69% rename from tests/accuracy/sdk/accuracy-result-storage/get-accuracy-result-storage.ts rename to tests/accuracy/sdk/accuracyResultStorage/getAccuracyResultStorage.ts index 82475bff..127fc5f1 100644 --- a/tests/accuracy/sdk/accuracy-result-storage/get-accuracy-result-storage.ts +++ b/tests/accuracy/sdk/accuracyResultStorage/getAccuracyResultStorage.ts @@ -1,6 +1,6 @@ -import { DiskBasedResultStorage } from "./disk-storage.js"; -import { MongoDBBasedResultStorage } from "./mongodb-storage.js"; -import { AccuracyResultStorage } from "./result-storage.js"; +import { DiskBasedResultStorage } from "./diskStorage.js"; +import { MongoDBBasedResultStorage } from 
"./mongodbStorage.js"; +import { AccuracyResultStorage } from "./resultStorage.js"; export function getAccuracyResultStorage(): AccuracyResultStorage { const { MDB_ACCURACY_MDB_URL, MDB_ACCURACY_MDB_DB, MDB_ACCURACY_MDB_COLLECTION } = process.env; diff --git a/tests/accuracy/sdk/accuracy-result-storage/mongodb-storage.ts b/tests/accuracy/sdk/accuracyResultStorage/mongodbStorage.ts similarity index 99% rename from tests/accuracy/sdk/accuracy-result-storage/mongodb-storage.ts rename to tests/accuracy/sdk/accuracyResultStorage/mongodbStorage.ts index 91ba8c61..463a54f6 100644 --- a/tests/accuracy/sdk/accuracy-result-storage/mongodb-storage.ts +++ b/tests/accuracy/sdk/accuracyResultStorage/mongodbStorage.ts @@ -6,7 +6,7 @@ import { AccuracyRunStatuses, ExpectedToolCall, ModelResponse, -} from "./result-storage.js"; +} from "./resultStorage.js"; // Omitting these as they might contain large chunk of texts const OMITTED_MODEL_RESPONSE_FIELDS: (keyof ModelResponse)[] = ["messages", "text"]; diff --git a/tests/accuracy/sdk/accuracy-result-storage/result-storage.ts b/tests/accuracy/sdk/accuracyResultStorage/resultStorage.ts similarity index 100% rename from tests/accuracy/sdk/accuracy-result-storage/result-storage.ts rename to tests/accuracy/sdk/accuracyResultStorage/resultStorage.ts diff --git a/tests/accuracy/sdk/accuracy-scorer.ts b/tests/accuracy/sdk/accuracyScorer.ts similarity index 97% rename from tests/accuracy/sdk/accuracy-scorer.ts rename to tests/accuracy/sdk/accuracyScorer.ts index 261f48dc..a013d34f 100644 --- a/tests/accuracy/sdk/accuracy-scorer.ts +++ b/tests/accuracy/sdk/accuracyScorer.ts @@ -1,5 +1,5 @@ import diff from "microdiff"; -import { ExpectedToolCall, LLMToolCall } from "./accuracy-result-storage/result-storage.js"; +import { ExpectedToolCall, LLMToolCall } from "./accuracyResultStorage/resultStorage.js"; /** * Tool calling accuracy is a single number calculated based on two dimensions. 
diff --git a/tests/accuracy/sdk/accuracy-testing-client.ts b/tests/accuracy/sdk/accuracyTestingClient.ts similarity index 98% rename from tests/accuracy/sdk/accuracy-testing-client.ts rename to tests/accuracy/sdk/accuracyTestingClient.ts index e9f83877..e07a5146 100644 --- a/tests/accuracy/sdk/accuracy-testing-client.ts +++ b/tests/accuracy/sdk/accuracyTestingClient.ts @@ -4,7 +4,7 @@ import { CallToolResult } from "@modelcontextprotocol/sdk/types.js"; import { StdioClientTransport } from "@modelcontextprotocol/sdk/client/stdio.js"; import { MCP_SERVER_CLI_SCRIPT } from "./constants.js"; -import { LLMToolCall } from "./accuracy-result-storage/result-storage.js"; +import { LLMToolCall } from "./accuracyResultStorage/resultStorage.js"; import { VercelMCPClient, VercelMCPClientTools } from "./agent.js"; type ToolResultGeneratorFn = (...parameters: unknown[]) => CallToolResult | Promise; diff --git a/tests/accuracy/sdk/describe-accuracy-tests.ts b/tests/accuracy/sdk/describeAccuracyTests.ts similarity index 93% rename from tests/accuracy/sdk/describe-accuracy-tests.ts rename to tests/accuracy/sdk/describeAccuracyTests.ts index 4f3c9633..a10d46ef 100644 --- a/tests/accuracy/sdk/describe-accuracy-tests.ts +++ b/tests/accuracy/sdk/describeAccuracyTests.ts @@ -1,12 +1,12 @@ import { describe, it, beforeAll, beforeEach, afterAll } from "vitest"; import { getAvailableModels } from "./models.js"; -import { calculateToolCallingAccuracy } from "./accuracy-scorer.js"; +import { calculateToolCallingAccuracy } from "./accuracyScorer.js"; import { getVercelToolCallingAgent, VercelAgent } from "./agent.js"; import { prepareTestData, setupMongoDBIntegrationTest } from "../../integration/tools/mongodb/mongodbHelpers.js"; -import { AccuracyTestingClient, MockedTools } from "./accuracy-testing-client.js"; -import { AccuracyResultStorage, ExpectedToolCall } from "./accuracy-result-storage/result-storage.js"; -import { getAccuracyResultStorage } from 
"./accuracy-result-storage/get-accuracy-result-storage.js"; -import { getCommitSHA } from "./git-info.js"; +import { AccuracyTestingClient, MockedTools } from "./accuracyTestingClient.js"; +import { AccuracyResultStorage, ExpectedToolCall } from "./accuracyResultStorage/resultStorage.js"; +import { getAccuracyResultStorage } from "./accuracyResultStorage/getAccuracyResultStorage.js"; +import { getCommitSHA } from "./gitInfo.js"; export interface AccuracyTestConfig { /** The prompt to be provided to LLM for evaluation. */ diff --git a/tests/accuracy/sdk/git-info.ts b/tests/accuracy/sdk/gitInfo.ts similarity index 50% rename from tests/accuracy/sdk/git-info.ts rename to tests/accuracy/sdk/gitInfo.ts index a0918a6f..03e34a7d 100644 --- a/tests/accuracy/sdk/git-info.ts +++ b/tests/accuracy/sdk/gitInfo.ts @@ -5,8 +5,3 @@ export async function getCommitSHA(): Promise { const lastCommit = commitLogs.latest; return lastCommit?.hash; } - -export async function getMergeBase(targetBranch: string, workBranchOrCommit: string): Promise { - const result = await simpleGit().raw(["merge-base", targetBranch, workBranchOrCommit]); - return result.trim(); -} diff --git a/tests/accuracy/updateMany.test.ts b/tests/accuracy/updateMany.test.ts index 0975a2b9..e9cc2ab7 100644 --- a/tests/accuracy/updateMany.test.ts +++ b/tests/accuracy/updateMany.test.ts @@ -1,5 +1,5 @@ -import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; -import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; +import { describeAccuracyTests } from "./sdk/describeAccuracyTests.js"; +import { AccuracyTestConfig } from "./sdk/describeAccuracyTests.js"; function callsUpdateManyWithEmptyFilters(prompt: string): AccuracyTestConfig { return { diff --git a/tests/unit/accuracy-scorer.test.ts b/tests/unit/accuracyScorer.test.ts similarity index 99% rename from tests/unit/accuracy-scorer.test.ts rename to tests/unit/accuracyScorer.test.ts index 8519e492..8992822c 100644 --- 
a/tests/unit/accuracy-scorer.test.ts +++ b/tests/unit/accuracyScorer.test.ts @@ -1,6 +1,6 @@ import { describe, expect, it } from "vitest"; -import { calculateToolCallingAccuracy } from "../accuracy/sdk/accuracy-scorer.js"; -import { ExpectedToolCall, LLMToolCall } from "../accuracy/sdk/accuracy-result-storage/result-storage.js"; +import { calculateToolCallingAccuracy } from "../accuracy/sdk/accuracyScorer.js"; +import { ExpectedToolCall, LLMToolCall } from "../accuracy/sdk/accuracyResultStorage/resultStorage.js"; describe("calculateToolCallingAccuracy", () => { describe("edge cases", () => { From a66553bf05990a35ec996661d9cd38e564c3b1c6 Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Thu, 17 Jul 2025 11:13:45 +0200 Subject: [PATCH 78/91] chore: update accuracy file name per convention --- package.json | 2 +- .../{generate-test-summary.ts => generateTestSummary.ts} | 4 ++-- .../accuracy/{run-accuracy-tests.sh => runAccuracyTests.sh} | 4 ++-- ...date-accuracy-run-status.ts => updateAccuracyRunStatus.ts} | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) rename scripts/accuracy/{generate-test-summary.ts => generateTestSummary.ts} (99%) rename scripts/accuracy/{run-accuracy-tests.sh => runAccuracyTests.sh} (88%) rename scripts/accuracy/{update-accuracy-run-status.ts => updateAccuracyRunStatus.ts} (90%) diff --git a/package.json b/package.json index bd545ff0..d2de4e3e 100644 --- a/package.json +++ b/package.json @@ -31,7 +31,7 @@ "generate": "./scripts/generate.sh", "test": "vitest --project unit-and-integration --coverage", "pretest:accuracy": "npm run build:compile", - "test:accuracy": "sh ./scripts/accuracy/run-accuracy-tests.sh" + "test:accuracy": "sh ./scripts/accuracy/runAccuracyTests.sh" }, "license": "Apache-2.0", "devDependencies": { diff --git a/scripts/accuracy/generate-test-summary.ts b/scripts/accuracy/generateTestSummary.ts similarity index 99% rename from scripts/accuracy/generate-test-summary.ts rename to 
scripts/accuracy/generateTestSummary.ts index 3f5783d7..6b9092f1 100644 --- a/scripts/accuracy/generate-test-summary.ts +++ b/scripts/accuracy/generateTestSummary.ts @@ -1,13 +1,13 @@ import path from "path"; import { readFile, writeFile, mkdir } from "fs/promises"; -import { getAccuracyResultStorage } from "../../tests/accuracy/sdk/accuracy-result-storage/getAccuracyResultStorage.js"; +import { getAccuracyResultStorage } from "../../tests/accuracy/sdk/accuracyResultStorage/getAccuracyResultStorage.js"; import { AccuracyResult, AccuracyRunStatuses, ExpectedToolCall, LLMToolCall, ModelResponse, -} from "../../tests/accuracy/sdk/accuracy-result-storage/resultStorage.js"; +} from "../../tests/accuracy/sdk/accuracyResultStorage/resultStorage.js"; import { getCommitSHA } from "../../tests/accuracy/sdk/gitInfo.js"; import { HTML_TEST_SUMMARY_FILE, diff --git a/scripts/accuracy/run-accuracy-tests.sh b/scripts/accuracy/runAccuracyTests.sh similarity index 88% rename from scripts/accuracy/run-accuracy-tests.sh rename to scripts/accuracy/runAccuracyTests.sh index c8607de8..312d08a1 100644 --- a/scripts/accuracy/run-accuracy-tests.sh +++ b/scripts/accuracy/runAccuracyTests.sh @@ -37,9 +37,9 @@ TEST_EXIT_CODE=$? # This is necessary when comparing one accuracy run with another as we wouldn't # want to compare against an incomplete run. export MDB_ACCURACY_RUN_STATUS=$([ $TEST_EXIT_CODE -eq 0 ] && echo "done" || echo "failed") -npx tsx scripts/accuracy/update-accuracy-run-status.ts || echo "Warning: Failed to update accuracy run status to '$MDB_ACCURACY_RUN_STATUS'" +npx tsx scripts/accuracy/updateAccuracyRunStatus.ts || echo "Warning: Failed to update accuracy run status to '$MDB_ACCURACY_RUN_STATUS'" # This is optional but we do it anyways to generate a readable summary of report. 
-npx tsx scripts/accuracy/generate-test-summary.ts || echo "Warning: Failed to generate test summary HTML report" +npx tsx scripts/accuracy/generateTestSummary.ts || echo "Warning: Failed to generate test summary HTML report" exit $TEST_EXIT_CODE \ No newline at end of file diff --git a/scripts/accuracy/update-accuracy-run-status.ts b/scripts/accuracy/updateAccuracyRunStatus.ts similarity index 90% rename from scripts/accuracy/update-accuracy-run-status.ts rename to scripts/accuracy/updateAccuracyRunStatus.ts index ae5544be..59608707 100644 --- a/scripts/accuracy/update-accuracy-run-status.ts +++ b/scripts/accuracy/updateAccuracyRunStatus.ts @@ -1,5 +1,5 @@ -import { getAccuracyResultStorage } from "../../tests/accuracy/sdk/accuracy-result-storage/getAccuracyResultStorage.js"; -import { AccuracyRunStatus } from "../../tests/accuracy/sdk/accuracy-result-storage/resultStorage.js"; +import { getAccuracyResultStorage } from "../../tests/accuracy/sdk/accuracyResultStorage/getAccuracyResultStorage.js"; +import { AccuracyRunStatus } from "../../tests/accuracy/sdk/accuracyResultStorage/resultStorage.js"; import { getCommitSHA } from "../../tests/accuracy/sdk/gitInfo.js"; const envAccuracyRunId = process.env.MDB_ACCURACY_RUN_ID; From ab9961372531bc476c01c3cba4f7c9ac1d274721 Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Thu, 17 Jul 2025 12:16:57 +0200 Subject: [PATCH 79/91] chore: move test config out of functions --- tests/accuracy/collectionIndexes.test.ts | 41 +++++--- tests/accuracy/collectionSchema.test.ts | 26 +++-- tests/accuracy/count.test.ts | 60 +++++------ tests/accuracy/createCollection.test.ts | 50 +++++---- tests/accuracy/dbStats.test.ts | 15 ++- tests/accuracy/deleteMany.test.ts | 37 +++---- tests/accuracy/dropCollection.test.ts | 111 ++++++++++---------- tests/accuracy/dropDatabase.test.ts | 58 +++++------ tests/accuracy/explain.test.ts | 110 +++++++++++--------- tests/accuracy/find.test.ts | 124 ++++++++++------------- tests/accuracy/insertMany.test.ts 
| 33 ++---- tests/accuracy/listCollections.test.ts | 45 ++++---- tests/accuracy/listDatabases.test.ts | 33 ++++-- tests/accuracy/logs.test.ts | 36 ++++--- tests/accuracy/renameCollection.test.ts | 25 ++--- 15 files changed, 403 insertions(+), 401 deletions(-) diff --git a/tests/accuracy/collectionIndexes.test.ts b/tests/accuracy/collectionIndexes.test.ts index de8306a6..5db4de1e 100644 --- a/tests/accuracy/collectionIndexes.test.ts +++ b/tests/accuracy/collectionIndexes.test.ts @@ -1,9 +1,8 @@ import { describeAccuracyTests } from "./sdk/describeAccuracyTests.js"; -import { AccuracyTestConfig } from "./sdk/describeAccuracyTests.js"; -function callsCollectionIndexes(prompt: string): AccuracyTestConfig { - return { - prompt: prompt, +describeAccuracyTests([ + { + prompt: "How many indexes do I have in 'mflix.movies' namespace?", expectedToolCalls: [ { toolName: "collection-indexes", @@ -13,13 +12,29 @@ function callsCollectionIndexes(prompt: string): AccuracyTestConfig { }, }, ], - }; -} - -describeAccuracyTests([ - callsCollectionIndexes("How many indexes do I have in 'mflix.movies' namespace?"), - callsCollectionIndexes("List all the indexes in movies collection in mflix database"), - callsCollectionIndexes( - `Is the following query: ${JSON.stringify({ runtime: { $lt: 100 } })} on the namespace 'mflix.movies' indexed?` - ), + }, + { + prompt: "List all the indexes in movies collection in mflix database", + expectedToolCalls: [ + { + toolName: "collection-indexes", + parameters: { + database: "mflix", + collection: "movies", + }, + }, + ], + }, + { + prompt: `Is the following query: ${JSON.stringify({ runtime: { $lt: 100 } })} on the namespace 'mflix.movies' indexed?`, + expectedToolCalls: [ + { + toolName: "collection-indexes", + parameters: { + database: "mflix", + collection: "movies", + }, + }, + ], + }, ]); diff --git a/tests/accuracy/collectionSchema.test.ts b/tests/accuracy/collectionSchema.test.ts index 36890d76..f3479657 100644 --- 
a/tests/accuracy/collectionSchema.test.ts +++ b/tests/accuracy/collectionSchema.test.ts @@ -1,9 +1,8 @@ import { describeAccuracyTests } from "./sdk/describeAccuracyTests.js"; -import { AccuracyTestConfig } from "./sdk/describeAccuracyTests.js"; -function callsCollectionSchema(prompt: string): AccuracyTestConfig { - return { - prompt: prompt, +describeAccuracyTests([ + { + prompt: "Is there a title field in 'db1.coll1' namespace?", expectedToolCalls: [ { toolName: "collection-schema", @@ -13,10 +12,17 @@ function callsCollectionSchema(prompt: string): AccuracyTestConfig { }, }, ], - }; -} - -describeAccuracyTests([ - callsCollectionSchema("Is there a title field in 'db1.coll1' namespace?"), - callsCollectionSchema("What is the type of value stored in title field in coll1 collection in db1 database?"), + }, + { + prompt: "What is the type of value stored in title field in coll1 collection in db1 database?", + expectedToolCalls: [ + { + toolName: "collection-schema", + parameters: { + database: "db1", + collection: "coll1", + }, + }, + ], + }, ]); diff --git a/tests/accuracy/count.test.ts b/tests/accuracy/count.test.ts index 5fa9a473..95e817ad 100644 --- a/tests/accuracy/count.test.ts +++ b/tests/accuracy/count.test.ts @@ -1,53 +1,41 @@ import { describeAccuracyTests } from "./sdk/describeAccuracyTests.js"; -import { AccuracyTestConfig } from "./sdk/describeAccuracyTests.js"; -function callsCountToolWithEmptyQuery(prompt: string, database = "mflix", collection = "movies"): AccuracyTestConfig { - return { - prompt: prompt, +describeAccuracyTests([ + { + prompt: "Count number of documents in 'mflix.movies' namespace.", expectedToolCalls: [ { toolName: "count", parameters: { - database, - collection, + database: "mflix", + collection: "movies", }, }, ], - }; -} - -function callsCountToolWithQuery( - prompt: string, - database = "mflix", - collection = "movies", - query: Record = {} -): AccuracyTestConfig { - return { - prompt: prompt, + }, + { + prompt: "How many 
documents are there in 'characters' collection in 'comics' database?", expectedToolCalls: [ { toolName: "count", parameters: { - database, - collection, - query, + database: "comics", + collection: "characters", }, }, ], - }; -} - -describeAccuracyTests([ - callsCountToolWithEmptyQuery("Count number of documents in 'mflix.movies' namespace."), - callsCountToolWithEmptyQuery( - "How many documents are there in 'characters' collection in 'comics' database?", - "comics", - "characters" - ), - callsCountToolWithQuery( - "Count all the documents in 'mflix.movies' namespace with runtime less than 100?", - "mflix", - "movies", - { runtime: { $lt: 100 } } - ), + }, + { + prompt: "Count all the documents in 'mflix.movies' namespace with runtime less than 100?", + expectedToolCalls: [ + { + toolName: "count", + parameters: { + database: "mflix", + collection: "movies", + query: { runtime: { $lt: 100 } }, + }, + }, + ], + }, ]); diff --git a/tests/accuracy/createCollection.test.ts b/tests/accuracy/createCollection.test.ts index 6efb1606..75c32e01 100644 --- a/tests/accuracy/createCollection.test.ts +++ b/tests/accuracy/createCollection.test.ts @@ -1,37 +1,33 @@ import { describeAccuracyTests } from "./sdk/describeAccuracyTests.js"; -import { AccuracyTestConfig } from "./sdk/describeAccuracyTests.js"; -import { ExpectedToolCall } from "./sdk/accuracyResultStorage/resultStorage.js"; -function callsCreateCollection(prompt: string, database: string, collection: string): AccuracyTestConfig { - return { - prompt: prompt, +describeAccuracyTests([ + { + prompt: "Create a new namespace 'mflix.documentaries'", expectedToolCalls: [ { toolName: "create-collection", parameters: { - database, - collection, + database: "mflix", + collection: "documentaries", }, }, ], - }; -} - -function callsCreateCollectionWithListCollections(prompt: string, expectedToolCalls: ExpectedToolCall[]) { - return { - injectConnectedAssumption: true, - prompt: prompt, - mockedTools: {}, - expectedToolCalls, - }; 
-} - -describeAccuracyTests([ - callsCreateCollection("Create a new namespace 'mflix.documentaries'", "mflix", "documentaries"), - callsCreateCollection("Create a new collection villains in comics database", "comics", "villains"), - callsCreateCollectionWithListCollections( - "If and only if, the namespace 'mflix.documentaries' does not exist, then create it", - [ + }, + { + prompt: "Create a new collection villains in comics database", + expectedToolCalls: [ + { + toolName: "create-collection", + parameters: { + database: "comics", + collection: "villains", + }, + }, + ], + }, + { + prompt: "If and only if, the namespace 'mflix.documentaries' does not exist, then create it", + expectedToolCalls: [ { toolName: "list-collections", parameters: { @@ -45,6 +41,6 @@ describeAccuracyTests([ collection: "documentaries", }, }, - ] - ), + ], + }, ]); diff --git a/tests/accuracy/dbStats.test.ts b/tests/accuracy/dbStats.test.ts index 86abb6d8..f32d3495 100644 --- a/tests/accuracy/dbStats.test.ts +++ b/tests/accuracy/dbStats.test.ts @@ -1,18 +1,15 @@ import { describeAccuracyTests } from "./sdk/describeAccuracyTests.js"; -import { AccuracyTestConfig } from "./sdk/describeAccuracyTests.js"; -function callsListDatabases(prompt: string, database = "mflix"): AccuracyTestConfig { - return { - prompt: prompt, +describeAccuracyTests([ + { + prompt: "What is the size occupied by database mflix?", expectedToolCalls: [ { toolName: "db-stats", parameters: { - database, + database: "mflix", }, }, ], - }; -} - -describeAccuracyTests([callsListDatabases("What is the size occupied by database mflix?")]); + }, +]); diff --git a/tests/accuracy/deleteMany.test.ts b/tests/accuracy/deleteMany.test.ts index 61b94669..0963ca56 100644 --- a/tests/accuracy/deleteMany.test.ts +++ b/tests/accuracy/deleteMany.test.ts @@ -1,9 +1,8 @@ import { describeAccuracyTests } from "./sdk/describeAccuracyTests.js"; -import { AccuracyTestConfig } from "./sdk/describeAccuracyTests.js"; -function 
callsDeleteManyWithEmptyFilters(prompt: string): AccuracyTestConfig { - return { - prompt: prompt, +describeAccuracyTests([ + { + prompt: "Delete all the documents from 'mflix.movies' namespace", expectedToolCalls: [ { toolName: "delete-many", @@ -13,27 +12,29 @@ function callsDeleteManyWithEmptyFilters(prompt: string): AccuracyTestConfig { }, }, ], - }; -} - -function callsDeleteManyWithFilters(prompt: string): AccuracyTestConfig { - return { - prompt: prompt, + }, + { + prompt: "Purge the collection 'movies' in database 'mflix'", expectedToolCalls: [ { toolName: "delete-many", parameters: { database: "mflix", collection: "movies", - filter: { runtime: { $lt: 100 } }, }, }, ], - }; -} - -describeAccuracyTests([ - callsDeleteManyWithEmptyFilters("Delete all the documents from 'mflix.movies' namespace"), - callsDeleteManyWithEmptyFilters("Purge the collection 'movies' in database 'mflix'"), - callsDeleteManyWithFilters("Remove all the documents from namespace 'mflix.movies' where runtime is less than 100"), + }, + { + prompt: "Remove all the documents from namespace 'mflix.movies' where runtime is less than 100", + expectedToolCalls: [ + { + toolName: "delete-many", + parameters: { + database: "mflix", + collection: "movies", + }, + }, + ], + }, ]); diff --git a/tests/accuracy/dropCollection.test.ts b/tests/accuracy/dropCollection.test.ts index 3f7dd3dd..77fe06b8 100644 --- a/tests/accuracy/dropCollection.test.ts +++ b/tests/accuracy/dropCollection.test.ts @@ -1,10 +1,8 @@ import { describeAccuracyTests } from "./sdk/describeAccuracyTests.js"; -import { AccuracyTestConfig } from "./sdk/describeAccuracyTests.js"; -import { ExpectedToolCall } from "./sdk/accuracyResultStorage/resultStorage.js"; -function onlyCallsDropCollection(prompt: string): AccuracyTestConfig { - return { - prompt: prompt, +describeAccuracyTests([ + { + prompt: "Remove mflix.movies namespace from my cluster.", expectedToolCalls: [ { toolName: "drop-collection", @@ -14,60 +12,63 @@ function 
onlyCallsDropCollection(prompt: string): AccuracyTestConfig { }, }, ], - }; -} - -function callsDropCollection(prompt: string, expectedToolCalls: ExpectedToolCall[]): AccuracyTestConfig { - return { - prompt: prompt, - expectedToolCalls, - }; -} - -describeAccuracyTests([ - onlyCallsDropCollection("Remove mflix.movies namespace from my cluster."), - onlyCallsDropCollection("Drop movies collection from mflix database."), - callsDropCollection("Remove books collection from which ever database contains it.", [ - { - toolName: "list-databases", - parameters: {}, - }, - { - toolName: "list-collections", - parameters: { - database: "admin", + }, + { + prompt: "Drop movies collection from mflix database.", + expectedToolCalls: [ + { + toolName: "drop-collection", + parameters: { + database: "mflix", + collection: "movies", + }, }, - }, - { - toolName: "list-collections", - parameters: { - database: "comics", + ], + }, + { + prompt: "Remove books collection from which ever database contains it.", + expectedToolCalls: [ + { + toolName: "list-databases", + parameters: {}, }, - }, - { - toolName: "list-collections", - parameters: { - database: "config", + { + toolName: "list-collections", + parameters: { + database: "admin", + }, }, - }, - { - toolName: "list-collections", - parameters: { - database: "local", + { + toolName: "list-collections", + parameters: { + database: "comics", + }, + }, + { + toolName: "list-collections", + parameters: { + database: "config", + }, + }, + { + toolName: "list-collections", + parameters: { + database: "local", + }, }, - }, - { - toolName: "list-collections", - parameters: { - database: "mflix", + { + toolName: "list-collections", + parameters: { + database: "mflix", + }, }, - }, - { - toolName: "drop-collection", - parameters: { - database: "comics", - collection: "books", + { + toolName: "drop-collection", + parameters: { + database: "comics", + collection: "books", + }, }, - }, - ]), + ], + }, ]); diff --git 
a/tests/accuracy/dropDatabase.test.ts b/tests/accuracy/dropDatabase.test.ts index d9fb1757..3010e83a 100644 --- a/tests/accuracy/dropDatabase.test.ts +++ b/tests/accuracy/dropDatabase.test.ts @@ -1,10 +1,8 @@ import { describeAccuracyTests } from "./sdk/describeAccuracyTests.js"; -import { AccuracyTestConfig } from "./sdk/describeAccuracyTests.js"; -import { ExpectedToolCall } from "./sdk/accuracyResultStorage/resultStorage.js"; -function onlyCallsDropDatabase(prompt: string): AccuracyTestConfig { - return { - prompt: prompt, +describeAccuracyTests([ + { + prompt: "Remove mflix database from my cluster.", expectedToolCalls: [ { toolName: "drop-database", @@ -13,29 +11,31 @@ function onlyCallsDropDatabase(prompt: string): AccuracyTestConfig { }, }, ], - }; -} - -function callsDropDatabase(prompt: string, expectedToolCalls: ExpectedToolCall[]): AccuracyTestConfig { - return { - prompt: prompt, - expectedToolCalls, - }; -} - -describeAccuracyTests([ - onlyCallsDropDatabase("Remove mflix database from my cluster."), - onlyCallsDropDatabase("Drop database named mflix."), - callsDropDatabase("If there is a mflix database in my cluster then drop it.", [ - { - toolName: "list-databases", - parameters: {}, - }, - { - toolName: "drop-database", - parameters: { - database: "mflix", + }, + { + prompt: "Drop database named mflix.", + expectedToolCalls: [ + { + toolName: "drop-database", + parameters: { + database: "mflix", + }, }, - }, - ]), + ], + }, + { + prompt: "If there is a mflix database in my cluster then drop it.", + expectedToolCalls: [ + { + toolName: "list-databases", + parameters: {}, + }, + { + toolName: "drop-database", + parameters: { + database: "mflix", + }, + }, + ], + }, ]); diff --git a/tests/accuracy/explain.test.ts b/tests/accuracy/explain.test.ts index 3189b493..e6ffd6f3 100644 --- a/tests/accuracy/explain.test.ts +++ b/tests/accuracy/explain.test.ts @@ -1,63 +1,73 @@ import { describeAccuracyTests } from "./sdk/describeAccuracyTests.js"; -import { 
AccuracyTestConfig } from "./sdk/describeAccuracyTests.js"; -function callsExplain(prompt: string, method: Record): AccuracyTestConfig { - return { - prompt: prompt, +/** + * None of these tests score a parameter match on any of the models, likely + * because we are using Zod.union, when we probably should've used + * Zod.discriminatedUnion + */ +describeAccuracyTests([ + { + prompt: `Will fetching documents, where release_year is 2020, from 'mflix.movies' namespace perform a collection scan?`, expectedToolCalls: [ { toolName: "explain", parameters: { database: "mflix", collection: "movies", - method: [method], + method: [ + { + name: "find", + arguments: { + filter: { release_year: 2020 }, + }, + }, + ], }, }, ], - }; -} - -const callsExplainWithFind = (prompt: string) => - callsExplain(prompt, { - name: "find", - arguments: { - filter: { release_year: 2020 }, - }, - }); - -const callsExplainWithAggregate = (prompt: string) => - callsExplain(prompt, { - name: "aggregate", - arguments: { - pipeline: [ - { - $match: { release_year: 2020 }, + }, + { + prompt: `Will fetching documents, where release_year is 2020, from 'mflix.movies' namespace perform a collection scan?`, + expectedToolCalls: [ + { + toolName: "explain", + parameters: { + database: "mflix", + collection: "movies", + method: [ + { + name: "aggregate", + arguments: { + pipeline: [ + { + $match: { release_year: 2020 }, + }, + ], + }, + }, + ], }, - ], - }, - }); - -const callsExplainWithCount = (prompt: string) => - callsExplain(prompt, { - name: "count", - arguments: { - query: { release_year: 2020 }, - }, - }); - -/** - * None of these tests score a parameter match on any of the models, likely - * because we are using Zod.union, when we probably should've used - * Zod.discriminatedUnion - */ -describeAccuracyTests([ - callsExplainWithFind( - `Will fetching documents, where release_year is 2020, from 'mflix.movies' namespace perform a collection scan?` - ), - callsExplainWithAggregate( - `Will 
aggregating documents, where release_year is 2020, from 'mflix.movies' namespace perform a collection scan?` - ), - callsExplainWithCount( - `Will counting documents, where release_year is 2020, from 'mflix.movies' namespace perform a collection scan?` - ), + }, + ], + }, + { + prompt: `Will fetching documents, where release_year is 2020, from 'mflix.movies' namespace perform a collection scan?`, + expectedToolCalls: [ + { + toolName: "explain", + parameters: { + database: "mflix", + collection: "movies", + method: [ + { + name: "count", + arguments: { + query: { release_year: 2020 }, + }, + }, + ], + }, + }, + ], + }, ]); diff --git a/tests/accuracy/find.test.ts b/tests/accuracy/find.test.ts index ea6eadd0..24a0f5e0 100644 --- a/tests/accuracy/find.test.ts +++ b/tests/accuracy/find.test.ts @@ -1,116 +1,100 @@ import { describeAccuracyTests } from "./sdk/describeAccuracyTests.js"; -import { AccuracyTestConfig } from "./sdk/describeAccuracyTests.js"; -function callsFindNoFilter(prompt: string, database = "mflix", collection = "movies"): AccuracyTestConfig { - return { - prompt: prompt, +describeAccuracyTests([ + { + prompt: "List all the movies in 'mflix.movies' namespace.", expectedToolCalls: [ { toolName: "find", parameters: { - database, - collection, + database: "mflix", + collection: "movies", }, }, ], - }; -} - -function callsFindWithFilter(prompt: string, filter: Record): AccuracyTestConfig { - return { - prompt: prompt, + }, + { + prompt: "List all the documents in 'comics.books' namespace.", + expectedToolCalls: [ + { + toolName: "find", + parameters: { + database: "comics", + collection: "books", + }, + }, + ], + }, + { + prompt: "Find all the movies in 'mflix.movies' namespace with runtime less than 100.", expectedToolCalls: [ { toolName: "find", parameters: { database: "mflix", collection: "movies", - filter: filter, + filter: { + runtime: { $lt: 100 }, + }, }, }, ], - }; -} - -function callsFindWithProjection(prompt: string, projection: Record): 
AccuracyTestConfig { - return { - prompt: prompt, + }, + { + prompt: "Find all the movies in 'mflix.movies' namespace with runtime less than 100.", expectedToolCalls: [ { toolName: "find", parameters: { database: "mflix", collection: "movies", - projection, + filter: { + director: "Christina Collins", + }, }, }, ], - }; -} - -function callsFindWithProjectionAndFilters( - prompt: string, - filter: Record, - projection: Record -): AccuracyTestConfig { - return { - prompt: prompt, + }, + { + prompt: "Give me all the movie titles available in 'mflix.movies' namespace", expectedToolCalls: [ { toolName: "find", parameters: { database: "mflix", collection: "movies", - filter, - projection, + projection: { title: 1 }, }, }, ], - }; -} - -function callsFindWithFilterSortAndLimit( - prompt: string, - filter: Record, - sort: Record, - limit: number -): AccuracyTestConfig { - return { - prompt: prompt, + }, + { + prompt: "Use 'mflix.movies' namespace to answer who were casted in the movie 'Certain Fish'", expectedToolCalls: [ { toolName: "find", parameters: { database: "mflix", collection: "movies", - filter, - sort, - limit, + filter: { title: "Certain Fish" }, + projection: { cast: 1 }, }, }, ], - }; -} - -describeAccuracyTests([ - callsFindNoFilter("List all the movies in 'mflix.movies' namespace."), - callsFindNoFilter("List all the documents in 'comics.books' namespace.", "comics", "books"), - callsFindWithFilter("Find all the movies in 'mflix.movies' namespace with runtime less than 100.", { - runtime: { $lt: 100 }, - }), - callsFindWithFilter("Find all movies in 'mflix.movies' collection where director is 'Christina Collins'", { - director: "Christina Collins", - }), - callsFindWithProjection("Give me all the movie titles available in 'mflix.movies' namespace", { title: 1 }), - callsFindWithProjectionAndFilters( - "Use 'mflix.movies' namespace to answer who were casted in the movie 'Certain Fish'", - { title: "Certain Fish" }, - { cast: 1 } - ), - 
callsFindWithFilterSortAndLimit( - "From the mflix.movies namespace, give me first 2 movies of Horror genre sorted ascending by their runtime", - { genres: "Horror" }, - { runtime: 1 }, - 2 - ), + }, + { + prompt: "Use 'mflix.movies' namespace to answer who were casted in the movie 'Certain Fish'", + expectedToolCalls: [ + { + toolName: "find", + parameters: { + database: "mflix", + collection: "movies", + filter: { title: "Certain Fish" }, + sort: { runtime: 1 }, + limit: 2, + }, + }, + ], + }, ]); diff --git a/tests/accuracy/insertMany.test.ts b/tests/accuracy/insertMany.test.ts index 3116bc62..8c777a53 100644 --- a/tests/accuracy/insertMany.test.ts +++ b/tests/accuracy/insertMany.test.ts @@ -1,9 +1,12 @@ import { describeAccuracyTests } from "./sdk/describeAccuracyTests.js"; -import { AccuracyTestConfig } from "./sdk/describeAccuracyTests.js"; -function callsInsertMany(prompt: string): AccuracyTestConfig { - return { - prompt: prompt, +describeAccuracyTests([ + { + prompt: [ + "In my namespace 'mflix.movies', insert 3 documents each with the following fields:", + "- id: an incremental number starting from 1", + "- name: a string of format 'name'", + ].join("\n"), expectedToolCalls: [ { toolName: "insert-many", @@ -27,12 +30,9 @@ function callsInsertMany(prompt: string): AccuracyTestConfig { }, }, ], - }; -} - -function callsEmptyInsertMany(prompt: string) { - return { - prompt: prompt, + }, + { + prompt: "Add three empty documents in collection 'movies' inside database 'mflix'", expectedToolCalls: [ { toolName: "insert-many", @@ -43,16 +43,5 @@ function callsEmptyInsertMany(prompt: string) { }, }, ], - }; -} - -describeAccuracyTests([ - callsInsertMany( - [ - "In my namespace 'mflix.movies', insert 3 documents each with the following fields:", - "- id: an incremental number starting from 1", - "- name: a string of format 'name'", - ].join("\n") - ), - callsEmptyInsertMany("Add three empty documents in collection 'movies' inside database 'mflix'"), + }, ]); diff 
--git a/tests/accuracy/listCollections.test.ts b/tests/accuracy/listCollections.test.ts index 829e8712..f3361d80 100644 --- a/tests/accuracy/listCollections.test.ts +++ b/tests/accuracy/listCollections.test.ts @@ -1,23 +1,35 @@ import { describeAccuracyTests } from "./sdk/describeAccuracyTests.js"; -import { AccuracyTestConfig } from "./sdk/describeAccuracyTests.js"; -function callsListCollections(prompt: string): AccuracyTestConfig { - return { - prompt: prompt, +describeAccuracyTests([ + { + prompt: "How many collections do I have in database mflix?", expectedToolCalls: [ { toolName: "list-collections", parameters: { database: "mflix" }, }, ], - }; -} - -function callsListDatabasesAndListCollections(prompt: string): AccuracyTestConfig { - return { - injectConnectedAssumption: true, - prompt: prompt, - mockedTools: {}, + }, + { + prompt: "List all the collections in my MongoDB database mflix.", + expectedToolCalls: [ + { + toolName: "list-collections", + parameters: { database: "mflix" }, + }, + ], + }, + { + prompt: "Is there a shows collection in my MongoDB database mflix?", + expectedToolCalls: [ + { + toolName: "list-collections", + parameters: { database: "mflix" }, + }, + ], + }, + { + prompt: "List all the collections that I have in total on my cluster?", expectedToolCalls: [ { toolName: "list-databases", @@ -44,12 +56,5 @@ function callsListDatabasesAndListCollections(prompt: string): AccuracyTestConfi parameters: { database: "mflix" }, }, ], - }; -} - -describeAccuracyTests([ - callsListCollections("How many collections do I have in database mflix?"), - callsListCollections("List all the collections in my MongoDB database mflix."), - callsListCollections("Is there a shows collection in my MongoDB database mflix?"), - callsListDatabasesAndListCollections("List all the collections that I have in total on my cluster?"), + }, ]); diff --git a/tests/accuracy/listDatabases.test.ts b/tests/accuracy/listDatabases.test.ts index 8944a83d..4681fd7c 100644 --- 
a/tests/accuracy/listDatabases.test.ts +++ b/tests/accuracy/listDatabases.test.ts @@ -1,20 +1,31 @@ import { describeAccuracyTests } from "./sdk/describeAccuracyTests.js"; -import { AccuracyTestConfig } from "./sdk/describeAccuracyTests.js"; -function callsListDatabases(prompt: string): AccuracyTestConfig { - return { - prompt: prompt, +describeAccuracyTests([ + { + prompt: "How many databases do I have?", expectedToolCalls: [ { toolName: "list-databases", parameters: {}, }, ], - }; -} - -describeAccuracyTests([ - callsListDatabases("How many databases do I have?"), - callsListDatabases("List all the databases that I have in my clusters"), - callsListDatabases("Is there a mflix database in my cluster?"), + }, + { + prompt: "List all the databases that I have in my clusters", + expectedToolCalls: [ + { + toolName: "list-databases", + parameters: {}, + }, + ], + }, + { + prompt: "Is there a mflix database in my cluster?", + expectedToolCalls: [ + { + toolName: "list-databases", + parameters: {}, + }, + ], + }, ]); diff --git a/tests/accuracy/logs.test.ts b/tests/accuracy/logs.test.ts index f3e1f242..4da398a1 100644 --- a/tests/accuracy/logs.test.ts +++ b/tests/accuracy/logs.test.ts @@ -10,17 +10,27 @@ function callsLogsTool(prompt: string, toolCall: ExpectedToolCall): AccuracyTest } describeAccuracyTests([ - callsLogsTool("Were there any startup warnings for my MongoDB server?", { - toolName: "mongodb-logs", - parameters: { - type: "startupWarnings", - }, - }), - callsLogsTool("Retrieve first 10 logs for my MongoDB server?", { - toolName: "mongodb-logs", - parameters: { - type: "global", - limit: 10, - }, - }), + { + prompt: "Were there any startup warnings for my MongoDB server?", + expectedToolCalls: [ + { + toolName: "mongodb-logs", + parameters: { + type: "startupWarnings", + }, + }, + ], + }, + { + prompt: "Retrieve first 10 logs for my MongoDB server?", + expectedToolCalls: [ + { + toolName: "mongodb-logs", + parameters: { + type: "global", + limit: 10, + }, + 
}, + ], + }, ]); diff --git a/tests/accuracy/renameCollection.test.ts b/tests/accuracy/renameCollection.test.ts index ba160f20..9b2c9dac 100644 --- a/tests/accuracy/renameCollection.test.ts +++ b/tests/accuracy/renameCollection.test.ts @@ -1,9 +1,8 @@ import { describeAccuracyTests } from "./sdk/describeAccuracyTests.js"; -import { AccuracyTestConfig } from "./sdk/describeAccuracyTests.js"; -function callsRenameCollection(prompt: string): AccuracyTestConfig { - return { - prompt: prompt, +describeAccuracyTests([ + { + prompt: "Rename my 'mflix.movies' namespace to 'mflix.new_movies'", expectedToolCalls: [ { toolName: "rename-collection", @@ -14,12 +13,9 @@ function callsRenameCollection(prompt: string): AccuracyTestConfig { }, }, ], - }; -} - -function callsRenameCollectionWithDropTarget(prompt: string): AccuracyTestConfig { - return { - prompt: prompt, + }, + { + prompt: "Rename my 'mflix.movies' namespace to 'mflix.new_movies' while removing the old namespace.", expectedToolCalls: [ { toolName: "rename-collection", @@ -31,12 +27,5 @@ function callsRenameCollectionWithDropTarget(prompt: string): AccuracyTestConfig }, }, ], - }; -} - -describeAccuracyTests([ - callsRenameCollection("Rename my 'mflix.movies' namespace to 'mflix.new_movies'"), - callsRenameCollectionWithDropTarget( - "Rename my 'mflix.movies' namespace to 'mflix.new_movies' while removing the old namespace." 
- ), + }, ]); From 093ebcf072eeef55675484fdf02009b7da76d6e5 Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Thu, 17 Jul 2025 13:21:19 +0200 Subject: [PATCH 80/91] chore: move left out test config out of functions --- tests/accuracy/createIndex.test.ts | 40 ++++++++++++++++-------------- tests/accuracy/updateMany.test.ts | 30 ++++++---------------- 2 files changed, 30 insertions(+), 40 deletions(-) diff --git a/tests/accuracy/createIndex.test.ts b/tests/accuracy/createIndex.test.ts index 79d226f6..bce11b18 100644 --- a/tests/accuracy/createIndex.test.ts +++ b/tests/accuracy/createIndex.test.ts @@ -1,30 +1,34 @@ import { describeAccuracyTests } from "./sdk/describeAccuracyTests.js"; -import { AccuracyTestConfig } from "./sdk/describeAccuracyTests.js"; -function callsCreateIndex(prompt: string, indexKeys: Record): AccuracyTestConfig { - return { - prompt: prompt, +describeAccuracyTests([ + { + prompt: "Create an index that covers the following query on 'mflix.movies' namespace - { \"release_year\": 1992 }", expectedToolCalls: [ { toolName: "create-index", parameters: { database: "mflix", collection: "movies", - keys: indexKeys, + keys: { + release_year: 1, + }, }, }, ], - }; -} - -describeAccuracyTests([ - callsCreateIndex( - "Create an index that covers the following query on 'mflix.movies' namespace - { \"release_year\": 1992 }", - { - release_year: 1, - } - ), - callsCreateIndex("Create a text index on title field in 'mflix.movies' namespace", { - title: "text", - }), + }, + { + prompt: "Create a text index on title field in 'mflix.movies' namespace", + expectedToolCalls: [ + { + toolName: "create-index", + parameters: { + database: "mflix", + collection: "movies", + keys: { + title: "text", + }, + }, + }, + ], + }, ]); diff --git a/tests/accuracy/updateMany.test.ts b/tests/accuracy/updateMany.test.ts index e9cc2ab7..68576d11 100644 --- a/tests/accuracy/updateMany.test.ts +++ b/tests/accuracy/updateMany.test.ts @@ -1,9 +1,8 @@ import { describeAccuracyTests } 
from "./sdk/describeAccuracyTests.js"; -import { AccuracyTestConfig } from "./sdk/describeAccuracyTests.js"; -function callsUpdateManyWithEmptyFilters(prompt: string): AccuracyTestConfig { - return { - prompt: prompt, +describeAccuracyTests([ + { + prompt: "Update all the documents in 'mflix.movies' namespace with a new field 'new_field' set to 1", expectedToolCalls: [ { toolName: "update-many", @@ -18,19 +17,16 @@ function callsUpdateManyWithEmptyFilters(prompt: string): AccuracyTestConfig { }, }, ], - }; -} - -function callsUpdateManyWithFilters(prompt: string, filter: Record): AccuracyTestConfig { - return { - prompt: prompt, + }, + { + prompt: "Update all the documents in 'mflix.movies' namespace, where runtime is less than 100, with a new field 'new_field' set to 1", expectedToolCalls: [ { toolName: "update-many", parameters: { database: "mflix", collection: "movies", - filter, + filter: { runtime: { $lt: 100 } }, update: { $set: { new_field: 1, @@ -39,15 +35,5 @@ function callsUpdateManyWithFilters(prompt: string, filter: Record Date: Thu, 17 Jul 2025 13:29:37 +0200 Subject: [PATCH 81/91] chore: remove unused func --- tests/accuracy/logs.test.ts | 9 --------- 1 file changed, 9 deletions(-) diff --git a/tests/accuracy/logs.test.ts b/tests/accuracy/logs.test.ts index 4da398a1..907ea4d2 100644 --- a/tests/accuracy/logs.test.ts +++ b/tests/accuracy/logs.test.ts @@ -1,13 +1,4 @@ import { describeAccuracyTests } from "./sdk/describeAccuracyTests.js"; -import { AccuracyTestConfig } from "./sdk/describeAccuracyTests.js"; -import { ExpectedToolCall } from "./sdk/accuracyResultStorage/resultStorage.js"; - -function callsLogsTool(prompt: string, toolCall: ExpectedToolCall): AccuracyTestConfig { - return { - prompt: prompt, - expectedToolCalls: [toolCall], - }; -} describeAccuracyTests([ { From 4bbcba168d313994e914d669b1e194d39feeb4c3 Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Thu, 17 Jul 2025 13:47:29 +0200 Subject: [PATCH 82/91] chore: remove orphan checks --- 
tests/accuracy/sdk/accuracyScorer.ts | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/tests/accuracy/sdk/accuracyScorer.ts b/tests/accuracy/sdk/accuracyScorer.ts index a013d34f..73dd3f5b 100644 --- a/tests/accuracy/sdk/accuracyScorer.ts +++ b/tests/accuracy/sdk/accuracyScorer.ts @@ -97,16 +97,12 @@ export function calculateToolCallingAccuracy( function compareParams(expected: Record, actual: Record): number { const differences = diff(expected, actual); - if (differences.length === 0) { return 1; } const hasOnlyAdditions = differences.every((d) => d.type === "CREATE"); - const hasRemovals = differences.some((d) => d.type === "REMOVE"); - const hasChanges = differences.some((d) => d.type === "CHANGE"); - - if (hasOnlyAdditions && !hasRemovals && !hasChanges) { + if (hasOnlyAdditions) { return 0.75; } From 7c3061de34bc21e44cfbd0003741b68ef6dce460 Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Thu, 17 Jul 2025 15:43:56 +0200 Subject: [PATCH 83/91] chore: update the test prompt --- tests/accuracy/find.test.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/accuracy/find.test.ts b/tests/accuracy/find.test.ts index 24a0f5e0..667bd966 100644 --- a/tests/accuracy/find.test.ts +++ b/tests/accuracy/find.test.ts @@ -83,7 +83,7 @@ describeAccuracyTests([ ], }, { - prompt: "Use 'mflix.movies' namespace to answer who were casted in the movie 'Certain Fish'", + prompt: "From the mflix.movies namespace, give me first 2 movies of Horror genre sorted ascending by their runtime", expectedToolCalls: [ { toolName: "find", From ec52ee51c1b40736f9d8c2751f1830698fbd1e5b Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Fri, 18 Jul 2025 03:19:31 +0200 Subject: [PATCH 84/91] chore: allow adding custom parameter scorers In some cases, find, aggregate, count, explain, deleteMany, etc we need to grade extra provided arguments depending on the prompt itself. Sometimes additional parameters are fine and sometimes they are not. 
For example: increasing the keys in filter might lead to a different result hence if any such thing happens, we should grade the accuracy as 0 and not 0.75. To support this use-case, this commit introduces the idea of a custom scorer that could be plugged in to accuracy scorer to provide more controlled accuracy grading. Additionally this commit reverts the default behaviour of handling added parameters. Earlier we were marking newly added parameters as hallucinations and hence grading 0.75. But now, after figuring out that most of our tools don't even expect extra parameters, we are flipping the switch and instead will now grade 0 when additional parameters are specified, unless there is a scorer provided to handle the custom scoring logic. --- tests/accuracy/aggregate.test.ts | 13 ++- tests/accuracy/collectionSchema.test.ts | 12 +-- tests/accuracy/count.test.ts | 36 +++++--- tests/accuracy/createIndex.test.ts | 31 ++++--- tests/accuracy/deleteMany.test.ts | 35 +++++--- tests/accuracy/explain.test.ts | 89 +++++++++++--------- tests/accuracy/find.test.ts | 100 +++++++++++++--------- tests/accuracy/sdk/accuracyScorer.ts | 12 ++- tests/accuracy/sdk/parameterScorer.ts | 59 +++++++++++++ tests/accuracy/updateMany.test.ts | 37 +++++---- tests/unit/accuracyScorer.test.ts | 105 +++++++++++++++++++++++- tests/unit/parameterScorer.test.ts | 87 ++++++++++++++++++++ 12 files changed, 475 insertions(+), 141 deletions(-) create mode 100644 tests/accuracy/sdk/parameterScorer.ts create mode 100644 tests/unit/parameterScorer.test.ts diff --git a/tests/accuracy/aggregate.test.ts b/tests/accuracy/aggregate.test.ts index 9e8ad13c..9b3050bd 100644 --- a/tests/accuracy/aggregate.test.ts +++ b/tests/accuracy/aggregate.test.ts @@ -1,4 +1,5 @@ import { describeAccuracyTests } from "./sdk/describeAccuracyTests.js"; +import { ParameterScorers, withParameterScorer } from "./sdk/parameterScorer.js"; describeAccuracyTests([ { @@ -6,9 +7,15 @@ describeAccuracyTests([ expectedToolCalls: [ {
toolName: "aggregate", - parameters: { - pipeline: { $group: { _id: "$release_year", count: { $sum: 1 } } }, - }, + parameters: withParameterScorer( + { + database: "mflix", + collection: "movies", + pipeline: [{ $group: { _id: "$release_year", count: { $sum: 1 } } }], + }, + // There should not be a $match at all hence the custom matcher + ParameterScorers.noAdditionsAllowedForPaths(["pipeline.0.$match"]) + ), }, ], }, diff --git a/tests/accuracy/collectionSchema.test.ts b/tests/accuracy/collectionSchema.test.ts index f3479657..8c9039bd 100644 --- a/tests/accuracy/collectionSchema.test.ts +++ b/tests/accuracy/collectionSchema.test.ts @@ -2,25 +2,25 @@ import { describeAccuracyTests } from "./sdk/describeAccuracyTests.js"; describeAccuracyTests([ { - prompt: "Is there a title field in 'db1.coll1' namespace?", + prompt: "Is there a title field in 'mflix.movies' namespace?", expectedToolCalls: [ { toolName: "collection-schema", parameters: { - database: "db1", - collection: "coll1", + database: "mflix", + collection: "movies", }, }, ], }, { - prompt: "What is the type of value stored in title field in coll1 collection in db1 database?", + prompt: "What is the type of value stored in title field in movies collection in mflix database?", expectedToolCalls: [ { toolName: "collection-schema", parameters: { - database: "db1", - collection: "coll1", + database: "mflix", + collection: "movies", }, }, ], diff --git a/tests/accuracy/count.test.ts b/tests/accuracy/count.test.ts index 95e817ad..d8d57785 100644 --- a/tests/accuracy/count.test.ts +++ b/tests/accuracy/count.test.ts @@ -1,4 +1,5 @@ import { describeAccuracyTests } from "./sdk/describeAccuracyTests.js"; +import { ParameterScorers, withParameterScorer } from "./sdk/parameterScorer.js"; describeAccuracyTests([ { @@ -6,10 +7,13 @@ describeAccuracyTests([ expectedToolCalls: [ { toolName: "count", - parameters: { - database: "mflix", - collection: "movies", - }, + parameters: withParameterScorer( + { + database: "mflix", 
+ collection: "movies", + }, + ParameterScorers.emptyAdditionsAllowedForPaths(["query"]) + ), }, ], }, @@ -18,10 +22,13 @@ describeAccuracyTests([ expectedToolCalls: [ { toolName: "count", - parameters: { - database: "comics", - collection: "characters", - }, + parameters: withParameterScorer( + { + database: "comics", + collection: "characters", + }, + ParameterScorers.emptyAdditionsAllowedForPaths(["query"]) + ), }, ], }, @@ -30,11 +37,14 @@ describeAccuracyTests([ expectedToolCalls: [ { toolName: "count", - parameters: { - database: "mflix", - collection: "movies", - query: { runtime: { $lt: 100 } }, - }, + parameters: withParameterScorer( + { + database: "mflix", + collection: "movies", + query: { runtime: { $lt: 100 } }, + }, + ParameterScorers.noAdditionsAllowedForPaths(["query"]) + ), }, ], }, diff --git a/tests/accuracy/createIndex.test.ts b/tests/accuracy/createIndex.test.ts index bce11b18..4b0b61ce 100644 --- a/tests/accuracy/createIndex.test.ts +++ b/tests/accuracy/createIndex.test.ts @@ -1,4 +1,5 @@ import { describeAccuracyTests } from "./sdk/describeAccuracyTests.js"; +import { ParameterScorers, withParameterScorer } from "./sdk/parameterScorer.js"; describeAccuracyTests([ { @@ -6,13 +7,16 @@ describeAccuracyTests([ expectedToolCalls: [ { toolName: "create-index", - parameters: { - database: "mflix", - collection: "movies", - keys: { - release_year: 1, + parameters: withParameterScorer( + { + database: "mflix", + collection: "movies", + keys: { + release_year: 1, + }, }, - }, + ParameterScorers.noAdditionsAllowedForPaths(["keys"]) + ), }, ], }, @@ -21,13 +25,16 @@ describeAccuracyTests([ expectedToolCalls: [ { toolName: "create-index", - parameters: { - database: "mflix", - collection: "movies", - keys: { - title: "text", + parameters: withParameterScorer( + { + database: "mflix", + collection: "movies", + keys: { + title: "text", + }, }, - }, + ParameterScorers.noAdditionsAllowedForPaths(["keys"]) + ), }, ], }, diff --git 
a/tests/accuracy/deleteMany.test.ts b/tests/accuracy/deleteMany.test.ts index 0963ca56..be507f6e 100644 --- a/tests/accuracy/deleteMany.test.ts +++ b/tests/accuracy/deleteMany.test.ts @@ -1,4 +1,5 @@ import { describeAccuracyTests } from "./sdk/describeAccuracyTests.js"; +import { ParameterScorers, withParameterScorer } from "./sdk/parameterScorer.js"; describeAccuracyTests([ { @@ -6,10 +7,13 @@ describeAccuracyTests([ expectedToolCalls: [ { toolName: "delete-many", - parameters: { - database: "mflix", - collection: "movies", - }, + parameters: withParameterScorer( + { + database: "mflix", + collection: "movies", + }, + ParameterScorers.emptyAdditionsAllowedForPaths(["filter"]) + ), }, ], }, @@ -18,10 +22,13 @@ describeAccuracyTests([ expectedToolCalls: [ { toolName: "delete-many", - parameters: { - database: "mflix", - collection: "movies", - }, + parameters: withParameterScorer( + { + database: "mflix", + collection: "movies", + }, + ParameterScorers.emptyAdditionsAllowedForPaths(["filter"]) + ), }, ], }, @@ -30,10 +37,14 @@ describeAccuracyTests([ expectedToolCalls: [ { toolName: "delete-many", - parameters: { - database: "mflix", - collection: "movies", - }, + parameters: withParameterScorer( + { + database: "mflix", + collection: "movies", + filter: { runtime: { $lt: 100 } }, + }, + ParameterScorers.noAdditionsAllowedForPaths(["filter"]) + ), }, ], }, diff --git a/tests/accuracy/explain.test.ts b/tests/accuracy/explain.test.ts index e6ffd6f3..ec1f8002 100644 --- a/tests/accuracy/explain.test.ts +++ b/tests/accuracy/explain.test.ts @@ -1,4 +1,5 @@ import { describeAccuracyTests } from "./sdk/describeAccuracyTests.js"; +import { ParameterScorers, withParameterScorer } from "./sdk/parameterScorer.js"; /** * None of these tests score a parameter match on any of the models, likely @@ -11,62 +12,72 @@ describeAccuracyTests([ expectedToolCalls: [ { toolName: "explain", - parameters: { - database: "mflix", - collection: "movies", - method: [ - { - name: "find", - 
arguments: { - filter: { release_year: 2020 }, + parameters: withParameterScorer( + { + database: "mflix", + collection: "movies", + method: [ + { + name: "find", + arguments: { + filter: { release_year: 2020 }, + }, }, - }, - ], - }, + ], + }, + // Any addition to method itself will essentially change the explain output + ParameterScorers.noAdditionsAllowedForPaths(["method"]) + ), }, ], }, { - prompt: `Will fetching documents, where release_year is 2020, from 'mflix.movies' namespace perform a collection scan?`, + prompt: `Will aggregating documents, where release_year is 2020, from 'mflix.movies' namespace perform a collection scan?`, expectedToolCalls: [ { toolName: "explain", - parameters: { - database: "mflix", - collection: "movies", - method: [ - { - name: "aggregate", - arguments: { - pipeline: [ - { - $match: { release_year: 2020 }, - }, - ], + parameters: withParameterScorer( + { + database: "mflix", + collection: "movies", + method: [ + { + name: "aggregate", + arguments: { + pipeline: [ + { + $match: { release_year: 2020 }, + }, + ], + }, }, - }, - ], - }, + ], + }, + ParameterScorers.noAdditionsAllowedForPaths(["method"]) + ), }, ], }, { - prompt: `Will fetching documents, where release_year is 2020, from 'mflix.movies' namespace perform a collection scan?`, + prompt: `Will counting documents, where release_year is 2020, from 'mflix.movies' namespace perform a collection scan?`, expectedToolCalls: [ { toolName: "explain", - parameters: { - database: "mflix", - collection: "movies", - method: [ - { - name: "count", - arguments: { - query: { release_year: 2020 }, + parameters: withParameterScorer( + { + database: "mflix", + collection: "movies", + method: [ + { + name: "count", + arguments: { + query: { release_year: 2020 }, + }, }, - }, - ], - }, + ], + }, + ParameterScorers.noAdditionsAllowedForPaths(["method"]) + ), }, ], }, diff --git a/tests/accuracy/find.test.ts b/tests/accuracy/find.test.ts index 667bd966..b275abe4 100644 --- 
a/tests/accuracy/find.test.ts +++ b/tests/accuracy/find.test.ts @@ -1,4 +1,5 @@ import { describeAccuracyTests } from "./sdk/describeAccuracyTests.js"; +import { withParameterScorer, ParameterScorers } from "./sdk/parameterScorer.js"; describeAccuracyTests([ { @@ -6,10 +7,13 @@ describeAccuracyTests([ expectedToolCalls: [ { toolName: "find", - parameters: { - database: "mflix", - collection: "movies", - }, + parameters: withParameterScorer( + { + database: "mflix", + collection: "movies", + }, + ParameterScorers.emptyAdditionsAllowedForPaths(["filter"]) + ), }, ], }, @@ -18,10 +22,13 @@ describeAccuracyTests([ expectedToolCalls: [ { toolName: "find", - parameters: { - database: "comics", - collection: "books", - }, + parameters: withParameterScorer( + { + database: "comics", + collection: "books", + }, + ParameterScorers.emptyAdditionsAllowedForPaths(["filter"]) + ), }, ], }, @@ -30,28 +37,34 @@ describeAccuracyTests([ expectedToolCalls: [ { toolName: "find", - parameters: { - database: "mflix", - collection: "movies", - filter: { - runtime: { $lt: 100 }, + parameters: withParameterScorer( + { + database: "mflix", + collection: "movies", + filter: { + runtime: { $lt: 100 }, + }, }, - }, + ParameterScorers.noAdditionsAllowedForPaths(["filter"]) + ), }, ], }, { - prompt: "Find all the movies in 'mflix.movies' namespace with runtime less than 100.", + prompt: "Find all movies in 'mflix.movies' collection where director is 'Christina Collins'", expectedToolCalls: [ { toolName: "find", - parameters: { - database: "mflix", - collection: "movies", - filter: { - director: "Christina Collins", + parameters: withParameterScorer( + { + database: "mflix", + collection: "movies", + filter: { + director: "Christina Collins", + }, }, - }, + ParameterScorers.noAdditionsAllowedForPaths(["filter"]) + ), }, ], }, @@ -60,11 +73,14 @@ describeAccuracyTests([ expectedToolCalls: [ { toolName: "find", - parameters: { - database: "mflix", - collection: "movies", - projection: { title: 1 }, 
- }, + parameters: withParameterScorer( + { + database: "mflix", + collection: "movies", + projection: { title: 1 }, + }, + ParameterScorers.emptyAdditionsAllowedForPaths(["filter"]) + ), }, ], }, @@ -73,12 +89,15 @@ describeAccuracyTests([ expectedToolCalls: [ { toolName: "find", - parameters: { - database: "mflix", - collection: "movies", - filter: { title: "Certain Fish" }, - projection: { cast: 1 }, - }, + parameters: withParameterScorer( + { + database: "mflix", + collection: "movies", + filter: { title: "Certain Fish" }, + projection: { cast: 1 }, + }, + ParameterScorers.noAdditionsAllowedForPaths(["filter"]) + ), }, ], }, @@ -87,13 +106,16 @@ describeAccuracyTests([ expectedToolCalls: [ { toolName: "find", - parameters: { - database: "mflix", - collection: "movies", - filter: { title: "Certain Fish" }, - sort: { runtime: 1 }, - limit: 2, - }, + parameters: withParameterScorer( + { + database: "mflix", + collection: "movies", + filter: { genres: "Horror" }, + sort: { runtime: 1 }, + limit: 2, + }, + ParameterScorers.noAdditionsAllowedForPaths(["filter"]) + ), }, ], }, diff --git a/tests/accuracy/sdk/accuracyScorer.ts b/tests/accuracy/sdk/accuracyScorer.ts index 73dd3f5b..29b79bd1 100644 --- a/tests/accuracy/sdk/accuracyScorer.ts +++ b/tests/accuracy/sdk/accuracyScorer.ts @@ -1,5 +1,6 @@ import diff from "microdiff"; import { ExpectedToolCall, LLMToolCall } from "./accuracyResultStorage/resultStorage.js"; +import { PARAMETER_SCORER_SYMBOL, ParametersWithScorer } from "./parameterScorer.js"; /** * Tool calling accuracy is a single number calculated based on two dimensions. 
@@ -103,7 +104,16 @@ function compareParams(expected: Record, actual: Record d.type === "CREATE"); if (hasOnlyAdditions) { - return 0.75; + const expectedWithScorer = expected as ParametersWithScorer; + const customScorer = expectedWithScorer[PARAMETER_SCORER_SYMBOL]; + + // Most of our tools don't expect additional parameters to be passed so + // any additional parameter by default will be graded as 0. + if (!customScorer) { + return 0; + } + + return customScorer(differences); } return 0; diff --git a/tests/accuracy/sdk/parameterScorer.ts b/tests/accuracy/sdk/parameterScorer.ts new file mode 100644 index 00000000..f016b3d8 --- /dev/null +++ b/tests/accuracy/sdk/parameterScorer.ts @@ -0,0 +1,59 @@ +import { DifferenceCreate } from "microdiff"; + +export const PARAMETER_SCORER_SYMBOL = Symbol("parameterScorer"); + +export type AdditionalParameterScorer = (additions: DifferenceCreate[]) => number; +export interface ParametersWithScorer { + [PARAMETER_SCORER_SYMBOL]?: AdditionalParameterScorer; +} + +export function withParameterScorer>( + parameters: T, + scorer: AdditionalParameterScorer +): Record & ParametersWithScorer { + const result = { ...parameters } as Record & ParametersWithScorer; + result[PARAMETER_SCORER_SYMBOL] = scorer; + return result; +} + +function pathComponentsToFullPaths(pathComponents: (string | number)[]): string[] { + return pathComponents.reduce((fullPaths, pathComponent) => { + if (!fullPaths.length) { + return [pathComponent.toString()]; + } + return [...fullPaths, `${fullPaths.pop()}.${pathComponent}`]; + }, []); +} + +export const ParameterScorers = { + noAdditionsAllowedForPaths: (paths: string[]): AdditionalParameterScorer => { + return (additions: DifferenceCreate[]): number => { + const hasCriticalAddition = additions.some((diff) => { + // In case of nested objects / arrays the diff.path could have multiple entries + const diffPaths = pathComponentsToFullPaths(diff.path); + return diffPaths.some((diffPath) => 
paths.includes(diffPath)); + }); + return hasCriticalAddition ? 0 : 0.75; + }; + }, + emptyAdditionsAllowedForPaths: (paths: string[]): AdditionalParameterScorer => { + return (additions: DifferenceCreate[]): number => { + const hasNonEmptyAdditions = additions.some((diff) => { + const diffPaths = pathComponentsToFullPaths(diff.path); + const considerablePathHasAdditions = diffPaths.some((diffPath) => paths.includes(diffPath)); + const valueAtPath = diff.value; + return ( + considerablePathHasAdditions && + !( + valueAtPath === null || + valueAtPath === undefined || + (typeof valueAtPath === "object" && Object.keys(valueAtPath).length === 0) || + (Array.isArray(valueAtPath) && !valueAtPath.length) + ) + ); + }); + + return hasNonEmptyAdditions ? 0 : 0.75; + }; + }, +} as const; diff --git a/tests/accuracy/updateMany.test.ts b/tests/accuracy/updateMany.test.ts index 68576d11..ca0638b8 100644 --- a/tests/accuracy/updateMany.test.ts +++ b/tests/accuracy/updateMany.test.ts @@ -1,4 +1,5 @@ import { describeAccuracyTests } from "./sdk/describeAccuracyTests.js"; +import { ParameterScorers, withParameterScorer } from "./sdk/parameterScorer.js"; describeAccuracyTests([ { @@ -6,15 +7,18 @@ describeAccuracyTests([ expectedToolCalls: [ { toolName: "update-many", - parameters: { - database: "mflix", - collection: "movies", - update: { - $set: { - new_field: 1, + parameters: withParameterScorer( + { + database: "mflix", + collection: "movies", + update: { + $set: { + new_field: 1, + }, }, }, - }, + ParameterScorers.noAdditionsAllowedForPaths(["update"]) + ), }, ], }, @@ -23,16 +27,19 @@ describeAccuracyTests([ expectedToolCalls: [ { toolName: "update-many", - parameters: { - database: "mflix", - collection: "movies", - filter: { runtime: { $lt: 100 } }, - update: { - $set: { - new_field: 1, + parameters: withParameterScorer( + { + database: "mflix", + collection: "movies", + filter: { runtime: { $lt: 100 } }, + update: { + $set: { + new_field: 1, + }, }, }, - }, + 
ParameterScorers.noAdditionsAllowedForPaths(["filter", "update"]) + ), }, ], }, diff --git a/tests/unit/accuracyScorer.test.ts b/tests/unit/accuracyScorer.test.ts index 8992822c..54682b14 100644 --- a/tests/unit/accuracyScorer.test.ts +++ b/tests/unit/accuracyScorer.test.ts @@ -1,6 +1,7 @@ import { describe, expect, it } from "vitest"; import { calculateToolCallingAccuracy } from "../accuracy/sdk/accuracyScorer.js"; import { ExpectedToolCall, LLMToolCall } from "../accuracy/sdk/accuracyResultStorage/resultStorage.js"; +import { withParameterScorer, ParameterScorers } from "../accuracy/sdk/parameterScorer.js"; describe("calculateToolCallingAccuracy", () => { describe("edge cases", () => { @@ -69,7 +70,7 @@ describe("calculateToolCallingAccuracy", () => { }); describe("additional parameters", () => { - it("should return 0.75 when tool call has additional nested parameters", () => { + it("should return 0 when tool call has additional nested parameters (default behavior)", () => { const expected: ExpectedToolCall[] = [ { toolName: "find", parameters: { db: "test", collection: "users", filter: { status: "active" } } }, ]; @@ -86,6 +87,108 @@ describe("calculateToolCallingAccuracy", () => { }, ]; const result = calculateToolCallingAccuracy(expected, actual); + expect(result).toBe(0); + }); + + it("should return 0.75 when expected has no filter but actual has empty filter", () => { + const expected: ExpectedToolCall[] = [ + { + toolName: "find", + parameters: withParameterScorer( + { database: "mflix", collection: "movies" }, + ParameterScorers.emptyAdditionsAllowedForPaths(["filter"]) + ), + }, + ]; + const actual: LLMToolCall[] = [ + { + toolCallId: "1", + toolName: "find", + parameters: { database: "mflix", collection: "movies", filter: {} }, + }, + ]; + const result = calculateToolCallingAccuracy(expected, actual); + expect(result).toBe(0.75); + }); + + it("should return 0 when expected has no filter but actual has non-empty filter", () => { + const expected: 
ExpectedToolCall[] = [ + { + toolName: "find", + parameters: withParameterScorer( + { database: "mflix", collection: "movies" }, + ParameterScorers.emptyAdditionsAllowedForPaths(["filter"]) + ), + }, + ]; + const actual: LLMToolCall[] = [ + { + toolCallId: "1", + toolName: "find", + parameters: { database: "mflix", collection: "movies", filter: { genre: "Horror" } }, + }, + ]; + const result = calculateToolCallingAccuracy(expected, actual); + expect(result).toBe(0); + }); + + it("should return 0 when using noAdditionsAllowedForPaths and filter is modified", () => { + const expected: ExpectedToolCall[] = [ + { + toolName: "find", + parameters: withParameterScorer( + { + database: "mflix", + collection: "movies", + filter: { runtime: { $lt: 100 } }, + }, + ParameterScorers.noAdditionsAllowedForPaths(["filter"]) + ), + }, + ]; + const actual: LLMToolCall[] = [ + { + toolCallId: "1", + toolName: "find", + parameters: { + database: "mflix", + collection: "movies", + filter: { runtime: { $lt: 100 }, genre: "Horror" }, + }, + }, + ]; + const result = calculateToolCallingAccuracy(expected, actual); + expect(result).toBe(0); + }); + + it("should return 0.75 when using noAdditionsAllowedForPaths and non-critical parameters are added", () => { + const expected: ExpectedToolCall[] = [ + { + toolName: "find", + parameters: withParameterScorer( + { + database: "mflix", + collection: "movies", + filter: { runtime: { $lt: 100 } }, + }, + ParameterScorers.noAdditionsAllowedForPaths(["filter"]) + ), + }, + ]; + const actual: LLMToolCall[] = [ + { + toolCallId: "1", + toolName: "find", + parameters: { + database: "mflix", + collection: "movies", + filter: { runtime: { $lt: 100 } }, + limit: 10, + sort: { title: 1 }, + }, + }, + ]; + const result = calculateToolCallingAccuracy(expected, actual); expect(result).toBe(0.75); }); }); diff --git a/tests/unit/parameterScorer.test.ts b/tests/unit/parameterScorer.test.ts new file mode 100644 index 00000000..57426d86 --- /dev/null +++ 
b/tests/unit/parameterScorer.test.ts @@ -0,0 +1,87 @@ +import { describe, expect, it } from "vitest"; +import { DifferenceCreate } from "microdiff"; +import { ParameterScorers, withParameterScorer, PARAMETER_SCORER_SYMBOL } from "../accuracy/sdk/parameterScorer.js"; + +describe("ParameterScorers", () => { + describe("noAdditionsAllowedForPaths", () => { + const scorer = ParameterScorers.noAdditionsAllowedForPaths(["filter", "query"]); + + it("should return 0.75 when no additions are made", () => { + const additions: DifferenceCreate[] = []; + expect(scorer(additions)).toBe(0.75); + }); + + it("should return 0.75 when additions are made to non-protected paths", () => { + const additions: DifferenceCreate[] = [ + { type: "CREATE", path: ["limit"], value: 10 }, + { type: "CREATE", path: ["sort"], value: { name: 1 } }, + ]; + expect(scorer(additions)).toBe(0.75); + }); + + it("should return 0 when additions are made to protected top-level paths", () => { + const additions: DifferenceCreate[] = [{ type: "CREATE", path: ["filter"], value: { name: "test" } }]; + expect(scorer(additions)).toBe(0); + }); + + it("should return 0 when additions are made to deeply nested protected paths", () => { + const additions: DifferenceCreate[] = [{ type: "CREATE", path: ["filter", "age", "$gte"], value: 18 }]; + expect(scorer(additions)).toBe(0); + }); + + it("should handle array indices in paths correctly", () => { + const additions: DifferenceCreate[] = [{ type: "CREATE", path: ["filter", "tags", 0], value: "new-tag" }]; + expect(scorer(additions)).toBe(0); + }); + }); + + describe("emptyAdditionsAllowedForPaths", () => { + const scorer = ParameterScorers.emptyAdditionsAllowedForPaths(["filter", "options"]); + + it("should return 0.75 when no additions are made", () => { + const additions: DifferenceCreate[] = []; + expect(scorer(additions)).toBe(0.75); + }); + + it("should return 0.75 when empty object is added to protected path", () => { + const additions: DifferenceCreate[] = [{ 
type: "CREATE", path: ["filter"], value: {} }]; + expect(scorer(additions)).toBe(0.75); + }); + + it("should return 0.75 when null is added to protected path", () => { + const additions: DifferenceCreate[] = [{ type: "CREATE", path: ["filter"], value: null }]; + expect(scorer(additions)).toBe(0.75); + }); + + it("should return 0.75 when additions are made to non-protected paths", () => { + const additions: DifferenceCreate[] = [ + { type: "CREATE", path: ["limit"], value: 10 }, + { type: "CREATE", path: ["sort"], value: { name: 1 } }, + ]; + expect(scorer(additions)).toBe(0.75); + }); + + it("should return 0 when non-empty object is added to protected path", () => { + const additions: DifferenceCreate[] = [{ type: "CREATE", path: ["filter"], value: { name: "test" } }]; + expect(scorer(additions)).toBe(0); + }); + + it("should return 0 when non-empty additions are made to nested protected paths", () => { + const additions: DifferenceCreate[] = [{ type: "CREATE", path: ["filter", "name"], value: "test" }]; + expect(scorer(additions)).toBe(0); + }); + }); +}); + +describe("withParameterScorer", () => { + it("should attach scorer to parameters object", () => { + const params = { database: "test", collection: "users" }; + const scorer = ParameterScorers.noAdditionsAllowedForPaths(["filter"]); + + const result = withParameterScorer(params, scorer); + + expect(result.database).toBe("test"); + expect(result.collection).toBe("users"); + expect(result[PARAMETER_SCORER_SYMBOL]).toBe(scorer); + }); +}); From 743cbfac409b85416ad35000721f2d357e18598d Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Fri, 18 Jul 2025 03:25:55 +0200 Subject: [PATCH 85/91] chore: ts fixes --- tests/accuracy/sdk/parameterScorer.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/accuracy/sdk/parameterScorer.ts b/tests/accuracy/sdk/parameterScorer.ts index f016b3d8..879de262 100644 --- a/tests/accuracy/sdk/parameterScorer.ts +++ b/tests/accuracy/sdk/parameterScorer.ts @@ 
-41,7 +41,7 @@ export const ParameterScorers = { const hasNonEmptyAdditions = additions.some((diff) => { const diffPaths = pathComponentsToFullPaths(diff.path); const considerablePathHasAdditions = diffPaths.some((diffPath) => paths.includes(diffPath)); - const valueAtPath = diff.value; + const valueAtPath = diff.value as unknown; return ( considerablePathHasAdditions && !( From 3491a3b43e3989b626f49787526939cc208940ba Mon Sep 17 00:00:00 2001 From: Nikola Irinchev Date: Fri, 18 Jul 2025 11:13:51 +0200 Subject: [PATCH 86/91] fix: tweak the arg shapes to improve tool accuracy (#381) --- src/tools/mongodb/metadata/explain.ts | 2 +- src/tools/mongodb/read/aggregate.ts | 2 +- src/tools/mongodb/read/count.ts | 3 ++- src/tools/mongodb/read/find.ts | 13 +++++++++---- 4 files changed, 13 insertions(+), 7 deletions(-) diff --git a/src/tools/mongodb/metadata/explain.ts b/src/tools/mongodb/metadata/explain.ts index a686d9cc..ae9eb822 100644 --- a/src/tools/mongodb/metadata/explain.ts +++ b/src/tools/mongodb/metadata/explain.ts @@ -16,7 +16,7 @@ export class ExplainTool extends MongoDBToolBase { ...DbOperationArgs, method: z .array( - z.union([ + z.discriminatedUnion("name", [ z.object({ name: z.literal("aggregate"), arguments: z.object(AggregateArgs), diff --git a/src/tools/mongodb/read/aggregate.ts b/src/tools/mongodb/read/aggregate.ts index f9868dba..b74dd786 100644 --- a/src/tools/mongodb/read/aggregate.ts +++ b/src/tools/mongodb/read/aggregate.ts @@ -6,7 +6,7 @@ import { EJSON } from "bson"; import { checkIndexUsage } from "../../../helpers/indexCheck.js"; export const AggregateArgs = { - pipeline: z.array(z.record(z.string(), z.unknown())).describe("An array of aggregation stages to execute"), + pipeline: z.array(z.object({}).passthrough()).describe("An array of aggregation stages to execute"), }; export class AggregateTool extends MongoDBToolBase { diff --git a/src/tools/mongodb/read/count.ts b/src/tools/mongodb/read/count.ts index df3664b5..5f5f44c0 100644 --- 
a/src/tools/mongodb/read/count.ts +++ b/src/tools/mongodb/read/count.ts @@ -6,7 +6,8 @@ import { checkIndexUsage } from "../../../helpers/indexCheck.js"; export const CountArgs = { query: z - .record(z.string(), z.unknown()) + .object({}) + .passthrough() .optional() .describe( "A filter/query parameter. Allows users to filter the documents to count. Matches the syntax of the filter argument of db.collection.count()." diff --git a/src/tools/mongodb/read/find.ts b/src/tools/mongodb/read/find.ts index 02c337ed..0649e62d 100644 --- a/src/tools/mongodb/read/find.ts +++ b/src/tools/mongodb/read/find.ts @@ -8,18 +8,23 @@ import { checkIndexUsage } from "../../../helpers/indexCheck.js"; export const FindArgs = { filter: z - .record(z.string(), z.unknown()) + .object({}) + .passthrough() .optional() .describe("The query filter, matching the syntax of the query argument of db.collection.find()"), projection: z - .record(z.string(), z.unknown()) + .object({}) + .passthrough() .optional() .describe("The projection, matching the syntax of the projection argument of db.collection.find()"), limit: z.number().optional().default(10).describe("The maximum number of documents to return"), sort: z - .record(z.string(), z.custom()) + .object({}) + .catchall(z.custom()) .optional() - .describe("A document, describing the sort order, matching the syntax of the sort argument of cursor.sort()"), + .describe( + "A document, describing the sort order, matching the syntax of the sort argument of cursor.sort(). The keys of the object are the fields to sort on, while the values are the sort directions (1 for ascending, -1 for descending)." 
+ ), }; export class FindTool extends MongoDBToolBase { From 2909e8a0882ed6fdaa8a7705c082a80b373022c4 Mon Sep 17 00:00:00 2001 From: Nikola Irinchev Date: Fri, 18 Jul 2025 13:26:38 +0200 Subject: [PATCH 87/91] Replace the matcher framework --- src/tools/mongodb/create/createIndex.ts | 2 +- src/tools/mongodb/create/insertMany.ts | 2 +- src/tools/mongodb/delete/deleteMany.ts | 3 +- src/tools/mongodb/update/updateMany.ts | 6 +- tests/accuracy/aggregate.test.ts | 25 ++-- tests/accuracy/count.test.ts | 39 ++--- tests/accuracy/createIndex.test.ts | 34 ++--- tests/accuracy/deleteMany.test.ts | 39 ++--- tests/accuracy/explain.test.ts | 85 +++++------ tests/accuracy/find.test.ts | 108 +++++++------- tests/accuracy/sdk/accuracyScorer.ts | 43 +----- tests/accuracy/sdk/matcher.ts | 191 ++++++++++++++++++++++++ tests/accuracy/sdk/parameterScorer.ts | 59 -------- tests/accuracy/updateMany.test.ts | 40 +++-- tests/unit/accuracyScorer.test.ts | 157 +++++++++++++++---- tests/unit/parameterScorer.test.ts | 87 ----------- 16 files changed, 499 insertions(+), 421 deletions(-) create mode 100644 tests/accuracy/sdk/matcher.ts delete mode 100644 tests/accuracy/sdk/parameterScorer.ts delete mode 100644 tests/unit/parameterScorer.test.ts diff --git a/src/tools/mongodb/create/createIndex.ts b/src/tools/mongodb/create/createIndex.ts index 8e393f04..c050c9aa 100644 --- a/src/tools/mongodb/create/createIndex.ts +++ b/src/tools/mongodb/create/createIndex.ts @@ -9,7 +9,7 @@ export class CreateIndexTool extends MongoDBToolBase { protected description = "Create an index for a collection"; protected argsShape = { ...DbOperationArgs, - keys: z.record(z.string(), z.custom()).describe("The index definition"), + keys: z.object({}).catchall(z.custom()).describe("The index definition"), name: z.string().optional().describe("The name of the index"), }; diff --git a/src/tools/mongodb/create/insertMany.ts b/src/tools/mongodb/create/insertMany.ts index 4744e344..25ecba17 100644 --- 
a/src/tools/mongodb/create/insertMany.ts +++ b/src/tools/mongodb/create/insertMany.ts @@ -9,7 +9,7 @@ export class InsertManyTool extends MongoDBToolBase { protected argsShape = { ...DbOperationArgs, documents: z - .array(z.record(z.string(), z.unknown()).describe("An individual MongoDB document")) + .array(z.object({}).passthrough().describe("An individual MongoDB document")) .describe( "The array of documents to insert, matching the syntax of the document argument of db.collection.insertMany()" ), diff --git a/src/tools/mongodb/delete/deleteMany.ts b/src/tools/mongodb/delete/deleteMany.ts index aa135512..8440a25c 100644 --- a/src/tools/mongodb/delete/deleteMany.ts +++ b/src/tools/mongodb/delete/deleteMany.ts @@ -10,7 +10,8 @@ export class DeleteManyTool extends MongoDBToolBase { protected argsShape = { ...DbOperationArgs, filter: z - .record(z.string(), z.unknown()) + .object({}) + .passthrough() .optional() .describe( "The query filter, specifying the deletion criteria. Matches the syntax of the filter argument of db.collection.deleteMany()" diff --git a/src/tools/mongodb/update/updateMany.ts b/src/tools/mongodb/update/updateMany.ts index b31a843e..49dd2099 100644 --- a/src/tools/mongodb/update/updateMany.ts +++ b/src/tools/mongodb/update/updateMany.ts @@ -10,13 +10,15 @@ export class UpdateManyTool extends MongoDBToolBase { protected argsShape = { ...DbOperationArgs, filter: z - .record(z.string(), z.unknown()) + .object({}) + .passthrough() .optional() .describe( "The selection criteria for the update, matching the syntax of the filter argument of db.collection.updateOne()" ), update: z - .record(z.string(), z.unknown()) + .object({}) + .passthrough() .describe("An update document describing the modifications to apply using update operator expressions"), upsert: z .boolean() diff --git a/tests/accuracy/aggregate.test.ts b/tests/accuracy/aggregate.test.ts index 9b3050bd..f42be7ed 100644 --- a/tests/accuracy/aggregate.test.ts +++ 
b/tests/accuracy/aggregate.test.ts @@ -1,5 +1,5 @@ import { describeAccuracyTests } from "./sdk/describeAccuracyTests.js"; -import { ParameterScorers, withParameterScorer } from "./sdk/parameterScorer.js"; +import { Matcher } from "./sdk/matcher.js"; describeAccuracyTests([ { @@ -7,15 +7,20 @@ describeAccuracyTests([ expectedToolCalls: [ { toolName: "aggregate", - parameters: withParameterScorer( - { - database: "mflix", - collection: "movies", - pipeline: [{ $group: { _id: "$release_year", count: { $sum: 1 } } }], - }, - // There should not be a $match at all hence the custom matcher - ParameterScorers.noAdditionsAllowedForPaths(["pipeline.0.$match"]) - ), + parameters: { + database: "mflix", + collection: "movies", + pipeline: [ + { $group: { _id: "$release_year", count: { $sum: 1 } } }, + // For the sake of accuracy, we allow any sort order + Matcher.composite( + Matcher.undefined, + Matcher.unknown({ + $sort: Matcher.anyValue, + }) + ), + ], + }, }, ], }, diff --git a/tests/accuracy/count.test.ts b/tests/accuracy/count.test.ts index d8d57785..7716aa65 100644 --- a/tests/accuracy/count.test.ts +++ b/tests/accuracy/count.test.ts @@ -1,5 +1,5 @@ import { describeAccuracyTests } from "./sdk/describeAccuracyTests.js"; -import { ParameterScorers, withParameterScorer } from "./sdk/parameterScorer.js"; +import { Matcher } from "./sdk/matcher.js"; describeAccuracyTests([ { @@ -7,13 +7,11 @@ describeAccuracyTests([ expectedToolCalls: [ { toolName: "count", - parameters: withParameterScorer( - { - database: "mflix", - collection: "movies", - }, - ParameterScorers.emptyAdditionsAllowedForPaths(["query"]) - ), + parameters: { + database: "mflix", + collection: "movies", + query: Matcher.emptyObjectOrUndefined, + }, }, ], }, @@ -22,13 +20,11 @@ describeAccuracyTests([ expectedToolCalls: [ { toolName: "count", - parameters: withParameterScorer( - { - database: "comics", - collection: "characters", - }, - ParameterScorers.emptyAdditionsAllowedForPaths(["query"]) - ), + 
parameters: { + database: "comics", + collection: "characters", + query: Matcher.emptyObjectOrUndefined, + }, }, ], }, @@ -37,14 +33,11 @@ describeAccuracyTests([ expectedToolCalls: [ { toolName: "count", - parameters: withParameterScorer( - { - database: "mflix", - collection: "movies", - query: { runtime: { $lt: 100 } }, - }, - ParameterScorers.noAdditionsAllowedForPaths(["query"]) - ), + parameters: { + database: "mflix", + collection: "movies", + query: { runtime: { $lt: 100 } }, + }, }, ], }, diff --git a/tests/accuracy/createIndex.test.ts b/tests/accuracy/createIndex.test.ts index 4b0b61ce..e162c6fa 100644 --- a/tests/accuracy/createIndex.test.ts +++ b/tests/accuracy/createIndex.test.ts @@ -1,5 +1,5 @@ import { describeAccuracyTests } from "./sdk/describeAccuracyTests.js"; -import { ParameterScorers, withParameterScorer } from "./sdk/parameterScorer.js"; +import { Matcher } from "./sdk/matcher.js"; describeAccuracyTests([ { @@ -7,16 +7,14 @@ describeAccuracyTests([ expectedToolCalls: [ { toolName: "create-index", - parameters: withParameterScorer( - { - database: "mflix", - collection: "movies", - keys: { - release_year: 1, - }, + parameters: { + database: "mflix", + collection: "movies", + name: Matcher.composite(Matcher.undefined, Matcher.string()), + keys: { + release_year: 1, }, - ParameterScorers.noAdditionsAllowedForPaths(["keys"]) - ), + }, }, ], }, @@ -25,16 +23,14 @@ describeAccuracyTests([ expectedToolCalls: [ { toolName: "create-index", - parameters: withParameterScorer( - { - database: "mflix", - collection: "movies", - keys: { - title: "text", - }, + parameters: { + database: "mflix", + collection: "movies", + name: Matcher.composite(Matcher.undefined, Matcher.string()), + keys: { + title: "text", }, - ParameterScorers.noAdditionsAllowedForPaths(["keys"]) - ), + }, }, ], }, diff --git a/tests/accuracy/deleteMany.test.ts b/tests/accuracy/deleteMany.test.ts index be507f6e..a5ab1f09 100644 --- a/tests/accuracy/deleteMany.test.ts +++ 
b/tests/accuracy/deleteMany.test.ts @@ -1,5 +1,5 @@ import { describeAccuracyTests } from "./sdk/describeAccuracyTests.js"; -import { ParameterScorers, withParameterScorer } from "./sdk/parameterScorer.js"; +import { Matcher } from "./sdk/matcher.js"; describeAccuracyTests([ { @@ -7,13 +7,11 @@ describeAccuracyTests([ expectedToolCalls: [ { toolName: "delete-many", - parameters: withParameterScorer( - { - database: "mflix", - collection: "movies", - }, - ParameterScorers.emptyAdditionsAllowedForPaths(["filter"]) - ), + parameters: { + database: "mflix", + collection: "movies", + filter: Matcher.emptyObjectOrUndefined, + }, }, ], }, @@ -22,13 +20,11 @@ describeAccuracyTests([ expectedToolCalls: [ { toolName: "delete-many", - parameters: withParameterScorer( - { - database: "mflix", - collection: "movies", - }, - ParameterScorers.emptyAdditionsAllowedForPaths(["filter"]) - ), + parameters: { + database: "mflix", + collection: "movies", + filter: Matcher.emptyObjectOrUndefined, + }, }, ], }, @@ -37,14 +33,11 @@ describeAccuracyTests([ expectedToolCalls: [ { toolName: "delete-many", - parameters: withParameterScorer( - { - database: "mflix", - collection: "movies", - filter: { runtime: { $lt: 100 } }, - }, - ParameterScorers.noAdditionsAllowedForPaths(["filter"]) - ), + parameters: { + database: "mflix", + collection: "movies", + filter: { runtime: { $lt: 100 } }, + }, }, ], }, diff --git a/tests/accuracy/explain.test.ts b/tests/accuracy/explain.test.ts index ec1f8002..cb9ac0c1 100644 --- a/tests/accuracy/explain.test.ts +++ b/tests/accuracy/explain.test.ts @@ -1,5 +1,4 @@ import { describeAccuracyTests } from "./sdk/describeAccuracyTests.js"; -import { ParameterScorers, withParameterScorer } from "./sdk/parameterScorer.js"; /** * None of these tests score a parameter match on any of the models, likely @@ -12,22 +11,18 @@ describeAccuracyTests([ expectedToolCalls: [ { toolName: "explain", - parameters: withParameterScorer( - { - database: "mflix", - collection: 
"movies", - method: [ - { - name: "find", - arguments: { - filter: { release_year: 2020 }, - }, + parameters: { + database: "mflix", + collection: "movies", + method: [ + { + name: "find", + arguments: { + filter: { release_year: 2020 }, }, - ], - }, - // Any addition to method itself will essentially change the explain output - ParameterScorers.noAdditionsAllowedForPaths(["method"]) - ), + }, + ], + }, }, ], }, @@ -36,25 +31,22 @@ describeAccuracyTests([ expectedToolCalls: [ { toolName: "explain", - parameters: withParameterScorer( - { - database: "mflix", - collection: "movies", - method: [ - { - name: "aggregate", - arguments: { - pipeline: [ - { - $match: { release_year: 2020 }, - }, - ], - }, + parameters: { + database: "mflix", + collection: "movies", + method: [ + { + name: "aggregate", + arguments: { + pipeline: [ + { + $match: { release_year: 2020 }, + }, + ], }, - ], - }, - ParameterScorers.noAdditionsAllowedForPaths(["method"]) - ), + }, + ], + }, }, ], }, @@ -63,21 +55,18 @@ describeAccuracyTests([ expectedToolCalls: [ { toolName: "explain", - parameters: withParameterScorer( - { - database: "mflix", - collection: "movies", - method: [ - { - name: "count", - arguments: { - query: { release_year: 2020 }, - }, + parameters: { + database: "mflix", + collection: "movies", + method: [ + { + name: "count", + arguments: { + query: { release_year: 2020 }, }, - ], - }, - ParameterScorers.noAdditionsAllowedForPaths(["method"]) - ), + }, + ], + }, }, ], }, diff --git a/tests/accuracy/find.test.ts b/tests/accuracy/find.test.ts index b275abe4..1f8b4bc5 100644 --- a/tests/accuracy/find.test.ts +++ b/tests/accuracy/find.test.ts @@ -1,5 +1,5 @@ import { describeAccuracyTests } from "./sdk/describeAccuracyTests.js"; -import { withParameterScorer, ParameterScorers } from "./sdk/parameterScorer.js"; +import { Matcher } from "./sdk/matcher.js"; describeAccuracyTests([ { @@ -7,13 +7,11 @@ describeAccuracyTests([ expectedToolCalls: [ { toolName: "find", - parameters: 
withParameterScorer( - { - database: "mflix", - collection: "movies", - }, - ParameterScorers.emptyAdditionsAllowedForPaths(["filter"]) - ), + parameters: { + database: "mflix", + collection: "movies", + filter: Matcher.emptyObjectOrUndefined, + }, }, ], }, @@ -22,13 +20,11 @@ describeAccuracyTests([ expectedToolCalls: [ { toolName: "find", - parameters: withParameterScorer( - { - database: "comics", - collection: "books", - }, - ParameterScorers.emptyAdditionsAllowedForPaths(["filter"]) - ), + parameters: { + database: "comics", + collection: "books", + filter: Matcher.emptyObjectOrUndefined, + }, }, ], }, @@ -37,16 +33,13 @@ describeAccuracyTests([ expectedToolCalls: [ { toolName: "find", - parameters: withParameterScorer( - { - database: "mflix", - collection: "movies", - filter: { - runtime: { $lt: 100 }, - }, + parameters: { + database: "mflix", + collection: "movies", + filter: { + runtime: { $lt: 100 }, }, - ParameterScorers.noAdditionsAllowedForPaths(["filter"]) - ), + }, }, ], }, @@ -55,16 +48,13 @@ describeAccuracyTests([ expectedToolCalls: [ { toolName: "find", - parameters: withParameterScorer( - { - database: "mflix", - collection: "movies", - filter: { - director: "Christina Collins", - }, + parameters: { + database: "mflix", + collection: "movies", + filter: { + director: "Christina Collins", }, - ParameterScorers.noAdditionsAllowedForPaths(["filter"]) - ), + }, }, ], }, @@ -73,14 +63,18 @@ describeAccuracyTests([ expectedToolCalls: [ { toolName: "find", - parameters: withParameterScorer( - { - database: "mflix", - collection: "movies", - projection: { title: 1 }, + parameters: { + database: "mflix", + collection: "movies", + projection: { + title: 1, + _id: Matcher.composite( + Matcher.undefined, + Matcher.number((value) => value === 0) + ), }, - ParameterScorers.emptyAdditionsAllowedForPaths(["filter"]) - ), + filter: Matcher.emptyObjectOrUndefined, + }, }, ], }, @@ -89,15 +83,16 @@ describeAccuracyTests([ expectedToolCalls: [ { toolName: "find", - 
parameters: withParameterScorer( - { - database: "mflix", - collection: "movies", - filter: { title: "Certain Fish" }, - projection: { cast: 1 }, + parameters: { + database: "mflix", + collection: "movies", + filter: { title: "Certain Fish" }, + projection: { + cast: 1, + _id: Matcher.composite(Matcher.undefined, Matcher.number()), }, - ParameterScorers.noAdditionsAllowedForPaths(["filter"]) - ), + limit: Matcher.number((value) => value > 0), + }, }, ], }, @@ -106,16 +101,13 @@ describeAccuracyTests([ expectedToolCalls: [ { toolName: "find", - parameters: withParameterScorer( - { - database: "mflix", - collection: "movies", - filter: { genres: "Horror" }, - sort: { runtime: 1 }, - limit: 2, - }, - ParameterScorers.noAdditionsAllowedForPaths(["filter"]) - ), + parameters: { + database: "mflix", + collection: "movies", + filter: { genres: "Horror" }, + sort: { runtime: 1 }, + limit: 2, + }, }, ], }, diff --git a/tests/accuracy/sdk/accuracyScorer.ts b/tests/accuracy/sdk/accuracyScorer.ts index 29b79bd1..83dc6a16 100644 --- a/tests/accuracy/sdk/accuracyScorer.ts +++ b/tests/accuracy/sdk/accuracyScorer.ts @@ -1,6 +1,5 @@ -import diff from "microdiff"; import { ExpectedToolCall, LLMToolCall } from "./accuracyResultStorage/resultStorage.js"; -import { PARAMETER_SCORER_SYMBOL, ParametersWithScorer } from "./parameterScorer.js"; +import { Matcher } from "./matcher.js"; /** * Tool calling accuracy is a single number calculated based on two dimensions. @@ -64,9 +63,7 @@ export function calculateToolCallingAccuracy( return actualToolCalls.length === 0 ? 1 : 0.75; } - const maxAccuracy = actualToolCalls.length > expectedToolCalls.length ? 0.75 : 1; - - const individualAccuracies: number[] = []; + let currentScore = actualToolCalls.length > expectedToolCalls.length ? 
0.75 : 1; const checkedActualToolCallIndexes = new Set(); for (const expectedCall of expectedToolCalls) { @@ -78,43 +75,19 @@ export function calculateToolCallingAccuracy( .map(({ call, index }) => ({ call, index, - score: compareParams(expectedCall.parameters, call.parameters), + score: Matcher.unknown(expectedCall.parameters).match(call.parameters), })) .filter(({ score }) => score >= 0.75) .sort((a, b) => b.score - a.score || a.index - b.index); const bestMatch = candidates[0]; - if (!bestMatch) { - individualAccuracies.push(0); - } else { - checkedActualToolCallIndexes.add(bestMatch.index); - const individualAccuracy = Math.min(bestMatch.score, maxAccuracy); - individualAccuracies.push(individualAccuracy); - } - } - - return Math.min(...individualAccuracies); -} - -function compareParams(expected: Record, actual: Record): number { - const differences = diff(expected, actual); - if (differences.length === 0) { - return 1; - } - - const hasOnlyAdditions = differences.every((d) => d.type === "CREATE"); - if (hasOnlyAdditions) { - const expectedWithScorer = expected as ParametersWithScorer; - const customScorer = expectedWithScorer[PARAMETER_SCORER_SYMBOL]; - - // Most of our tools don't expect additional parameters to be passed so - // any additional parameter by default will be graded as 0. 
- if (!customScorer) { - return 0; + if (!bestMatch || bestMatch.score === 0) { + return 0; // No matching tool call found, return 0 } - return customScorer(differences); + checkedActualToolCallIndexes.add(bestMatch.index); + currentScore = Math.min(currentScore, bestMatch.score); } - return 0; + return currentScore; } diff --git a/tests/accuracy/sdk/matcher.ts b/tests/accuracy/sdk/matcher.ts new file mode 100644 index 00000000..24bf50dc --- /dev/null +++ b/tests/accuracy/sdk/matcher.ts @@ -0,0 +1,191 @@ +const MATCHER_SYMBOL = Symbol("match"); + +export abstract class Matcher { + [MATCHER_SYMBOL] = true; + public abstract match(actual: unknown): number; + + public static get emptyObjectOrUndefined(): Matcher { + return new EmptyObjectOrUndefinedMatcher(); + } + + public static get anyValue(): Matcher { + return new AnyValueMatcher(); + } + + public static number(additionalFilter: (value: number) => boolean = () => true): Matcher { + return new NumberMatcher(additionalFilter); + } + + public static composite(...matchers: Matcher[]): Matcher { + return new CompositeMatcher(matchers); + } + + public static get undefined(): Matcher { + return new UndefinedMatcher(); + } + + public static boolean(expected?: boolean): Matcher { + return new BooleanMatcher(expected); + } + + public static string(): Matcher { + return new StringMatcher(); + } + + public static unknown(expected: unknown): Matcher { + if (typeof expected === "object" && expected !== null && MATCHER_SYMBOL in expected) { + return expected as Matcher; + } + + return new UnknownMatcher(expected); + } +} + +export const PARAMETER_SCORER_SYMBOL = Symbol("parameterScorer"); + +class EmptyObjectOrUndefinedMatcher extends Matcher { + public match(actual: unknown): number { + if ( + actual === undefined || + actual === null || + (typeof actual === "object" && Object.keys(actual).length === 0) + ) { + return 1; // Match if actual is undefined, null, or an empty object + } + + return 0; // No match + } +} + +class 
AnyValueMatcher extends Matcher { + public match(): number { + return 1; + } +} + +class NumberMatcher extends Matcher { + constructor(private additionalFilter: (value: number) => boolean = () => true) { + super(); + } + public match(actual: unknown): number { + return typeof actual === "number" && this.additionalFilter(actual) ? 1 : 0; + } +} + +class UndefinedMatcher extends Matcher { + public match(actual: unknown): number { + return actual === undefined ? 1 : 0; + } +} + +class CompositeMatcher extends Matcher { + constructor(private matchers: Matcher[]) { + super(); + } + + public match(actual: unknown): number { + let currentScore = 0; + + for (const matcher of this.matchers) { + const score = matcher.match(actual); + if (score === 1) { + return 1; // If one of the matchers is perfect score, return immediately + } + currentScore = Math.max(currentScore, score); + } + + return currentScore; + } +} + +class BooleanMatcher extends Matcher { + constructor(private expected?: boolean) { + super(); + } + + public match(actual: unknown): number { + return typeof actual === "boolean" && (this.expected === undefined || this.expected === actual) ? 1 : 0; + } +} + +class StringMatcher extends Matcher { + public match(actual: unknown): number { + return typeof actual === "string" ? 1 : 0; + } +} + +class UnknownMatcher extends Matcher { + constructor(private expected: unknown) { + super(); + } + + public match(actual: unknown): number { + if (this.expected === actual) { + // If both are the same, just return immediately. + return 1; + } + + if (this.expected === undefined || this.expected === null) { + // We expect null/undefined - return 1 if actual is also null/undefined + return actual === undefined || actual === null ? 
1 : 0; + } + + let currentScore = 1; + + if (Array.isArray(this.expected)) { + if (!Array.isArray(actual)) { + // One is an array, the other is not + return 0; + } + + if (actual.length > this.expected.length) { + // Actual array has more elements - this is likely an error (e.g. an aggregation pipeline with extra stages) + // If we want to allow extra elements, we should add matchers to the array + return 0; + } + + for (let i = 0; i < this.expected.length; i++) { + currentScore = Math.min(currentScore, Matcher.unknown(this.expected[i]).match(actual[i])); + if (currentScore === 0) { + // If we already found a mismatch, we can stop early + return 0; + } + } + } else if (typeof this.expected === "object") { + if (MATCHER_SYMBOL in this.expected) { + return (this.expected as Matcher).match(actual); + } + + if (typeof actual !== "object" || actual === null) { + // One is an object, the other is not + return 0; + } + + const expectedKeys = Object.keys(this.expected); + const actualKeys = Object.keys(actual); + + if (actualKeys.length > expectedKeys.length) { + // The model provided more keys than expected - this should not happen. + // If we want to allow some extra keys, we should specify that in the test definition + // by adding matchers for those keys. 
+ return 0; + } + + for (const key of expectedKeys) { + currentScore = Math.min( + currentScore, + Matcher.unknown((this.expected as Record)[key]).match( + (actual as Record)[key] + ) + ); + + if (currentScore === 0) { + // If we already found a mismatch, we can stop early + return 0; + } + } + } + + return currentScore; + } +} diff --git a/tests/accuracy/sdk/parameterScorer.ts b/tests/accuracy/sdk/parameterScorer.ts deleted file mode 100644 index 879de262..00000000 --- a/tests/accuracy/sdk/parameterScorer.ts +++ /dev/null @@ -1,59 +0,0 @@ -import { DifferenceCreate } from "microdiff"; - -export const PARAMETER_SCORER_SYMBOL = Symbol("parameterScorer"); - -export type AdditionalParameterScorer = (additions: DifferenceCreate[]) => number; -export interface ParametersWithScorer { - [PARAMETER_SCORER_SYMBOL]?: AdditionalParameterScorer; -} - -export function withParameterScorer>( - parameters: T, - scorer: AdditionalParameterScorer -): Record & ParametersWithScorer { - const result = { ...parameters } as Record & ParametersWithScorer; - result[PARAMETER_SCORER_SYMBOL] = scorer; - return result; -} - -function pathComponentsToFullPaths(pathComponents: (string | number)[]): string[] { - return pathComponents.reduce((fullPaths, pathComponent) => { - if (!fullPaths.length) { - return [pathComponent.toString()]; - } - return [...fullPaths, `${fullPaths.pop()}.${pathComponent}`]; - }, []); -} - -export const ParameterScorers = { - noAdditionsAllowedForPaths: (paths: string[]): AdditionalParameterScorer => { - return (additions: DifferenceCreate[]): number => { - const hasCriticalAddition = additions.some((diff) => { - // In case of nested objects / arrays the diff.path could have multiple entries - const diffPaths = pathComponentsToFullPaths(diff.path); - return diffPaths.some((diffPath) => paths.includes(diffPath)); - }); - return hasCriticalAddition ? 
0 : 0.75; - }; - }, - emptyAdditionsAllowedForPaths: (paths: string[]): AdditionalParameterScorer => { - return (additions: DifferenceCreate[]): number => { - const hasNonEmptyAdditions = additions.some((diff) => { - const diffPaths = pathComponentsToFullPaths(diff.path); - const considerablePathHasAdditions = diffPaths.some((diffPath) => paths.includes(diffPath)); - const valueAtPath = diff.value as unknown; - return ( - considerablePathHasAdditions && - !( - valueAtPath === null || - valueAtPath === undefined || - (typeof valueAtPath === "object" && Object.keys(valueAtPath).length === 0) || - (Array.isArray(valueAtPath) && !valueAtPath.length) - ) - ); - }); - - return hasNonEmptyAdditions ? 0 : 0.75; - }; - }, -} as const; diff --git a/tests/accuracy/updateMany.test.ts b/tests/accuracy/updateMany.test.ts index ca0638b8..66698286 100644 --- a/tests/accuracy/updateMany.test.ts +++ b/tests/accuracy/updateMany.test.ts @@ -1,5 +1,5 @@ import { describeAccuracyTests } from "./sdk/describeAccuracyTests.js"; -import { ParameterScorers, withParameterScorer } from "./sdk/parameterScorer.js"; +import { Matcher } from "./sdk/matcher.js"; describeAccuracyTests([ { @@ -7,18 +7,16 @@ describeAccuracyTests([ expectedToolCalls: [ { toolName: "update-many", - parameters: withParameterScorer( - { - database: "mflix", - collection: "movies", - update: { - $set: { - new_field: 1, - }, + parameters: { + database: "mflix", + collection: "movies", + update: { + $set: { + new_field: 1, }, }, - ParameterScorers.noAdditionsAllowedForPaths(["update"]) - ), + upsert: Matcher.composite(Matcher.undefined, Matcher.boolean()), + }, }, ], }, @@ -27,19 +25,17 @@ describeAccuracyTests([ expectedToolCalls: [ { toolName: "update-many", - parameters: withParameterScorer( - { - database: "mflix", - collection: "movies", - filter: { runtime: { $lt: 100 } }, - update: { - $set: { - new_field: 1, - }, + parameters: { + database: "mflix", + collection: "movies", + filter: { runtime: { $lt: 100 } }, + 
update: { + $set: { + new_field: 1, }, }, - ParameterScorers.noAdditionsAllowedForPaths(["filter", "update"]) - ), + upsert: Matcher.composite(Matcher.undefined, Matcher.boolean()), + }, }, ], }, diff --git a/tests/unit/accuracyScorer.test.ts b/tests/unit/accuracyScorer.test.ts index 54682b14..61566c55 100644 --- a/tests/unit/accuracyScorer.test.ts +++ b/tests/unit/accuracyScorer.test.ts @@ -1,7 +1,7 @@ import { describe, expect, it } from "vitest"; import { calculateToolCallingAccuracy } from "../accuracy/sdk/accuracyScorer.js"; import { ExpectedToolCall, LLMToolCall } from "../accuracy/sdk/accuracyResultStorage/resultStorage.js"; -import { withParameterScorer, ParameterScorers } from "../accuracy/sdk/parameterScorer.js"; +import { Matcher } from "../accuracy/sdk/matcher.js"; describe("calculateToolCallingAccuracy", () => { describe("edge cases", () => { @@ -90,60 +90,92 @@ describe("calculateToolCallingAccuracy", () => { expect(result).toBe(0); }); - it("should return 0.75 when expected has no filter but actual has empty filter", () => { + it("should return 1 when expected has no filter but actual has empty filter", () => { const expected: ExpectedToolCall[] = [ { toolName: "find", - parameters: withParameterScorer( - { database: "mflix", collection: "movies" }, - ParameterScorers.emptyAdditionsAllowedForPaths(["filter"]) - ), + parameters: { + database: "mflix", + collection: "movies", + filter: Matcher.emptyObjectOrUndefined, + }, }, ]; const actual: LLMToolCall[] = [ { toolCallId: "1", toolName: "find", - parameters: { database: "mflix", collection: "movies", filter: {} }, + parameters: { + database: "mflix", + collection: "movies", + filter: {}, + }, }, ]; const result = calculateToolCallingAccuracy(expected, actual); - expect(result).toBe(0.75); + expect(result).toBe(1); + }); + + it("should return 1 when expected has no filter and actual has no filter", () => { + const expected: ExpectedToolCall[] = [ + { + toolName: "find", + parameters: { + database: 
"mflix", + collection: "movies", + filter: Matcher.emptyObjectOrUndefined, + }, + }, + ]; + const actual: LLMToolCall[] = [ + { + toolCallId: "1", + toolName: "find", + parameters: { + database: "mflix", + collection: "movies", + }, + }, + ]; + const result = calculateToolCallingAccuracy(expected, actual); + expect(result).toBe(1); }); it("should return 0 when expected has no filter but actual has non-empty filter", () => { const expected: ExpectedToolCall[] = [ { toolName: "find", - parameters: withParameterScorer( - { database: "mflix", collection: "movies" }, - ParameterScorers.emptyAdditionsAllowedForPaths(["filter"]) - ), + parameters: { + database: "mflix", + collection: "movies", + filter: Matcher.emptyObjectOrUndefined, + }, }, ]; const actual: LLMToolCall[] = [ { toolCallId: "1", toolName: "find", - parameters: { database: "mflix", collection: "movies", filter: { genre: "Horror" } }, + parameters: { + database: "mflix", + collection: "movies", + filter: { genre: "Horror" }, + }, }, ]; const result = calculateToolCallingAccuracy(expected, actual); expect(result).toBe(0); }); - it("should return 0 when using noAdditionsAllowedForPaths and filter is modified", () => { + it("should return 0 when there are additional nested fields", () => { const expected: ExpectedToolCall[] = [ { toolName: "find", - parameters: withParameterScorer( - { - database: "mflix", - collection: "movies", - filter: { runtime: { $lt: 100 } }, - }, - ParameterScorers.noAdditionsAllowedForPaths(["filter"]) - ), + parameters: { + database: "mflix", + collection: "movies", + filter: { runtime: { $lt: 100 } }, + }, }, ]; const actual: LLMToolCall[] = [ @@ -161,18 +193,17 @@ describe("calculateToolCallingAccuracy", () => { expect(result).toBe(0); }); - it("should return 0.75 when using noAdditionsAllowedForPaths and non-critical parameters are added", () => { + it("should return 1 when ignored additional fields are provided", () => { const expected: ExpectedToolCall[] = [ { toolName: "find", 
- parameters: withParameterScorer( - { - database: "mflix", - collection: "movies", - filter: { runtime: { $lt: 100 } }, - }, - ParameterScorers.noAdditionsAllowedForPaths(["filter"]) - ), + parameters: { + database: "mflix", + collection: "movies", + filter: { runtime: { $lt: 100 } }, + limit: Matcher.number(), + sort: Matcher.anyValue, + }, }, ]; const actual: LLMToolCall[] = [ @@ -189,7 +220,69 @@ describe("calculateToolCallingAccuracy", () => { }, ]; const result = calculateToolCallingAccuracy(expected, actual); - expect(result).toBe(0.75); + expect(result).toBe(1); + }); + + it("should return 1 for array where additional elements are allowed", () => { + const expected: ExpectedToolCall[] = [ + { + toolName: "aggregate", + parameters: { + database: "mflix", + collection: "movies", + pipeline: [ + { $match: { genre: "Horror" } }, + Matcher.composite(Matcher.undefined, Matcher.anyValue), + ], + }, + }, + ]; + + const actual: LLMToolCall[] = [ + { + toolCallId: "1", + toolName: "aggregate", + parameters: { + database: "mflix", + collection: "movies", + pipeline: [{ $match: { genre: "Horror" } }, { $sort: { title: 1 } }], + }, + }, + ]; + + const result = calculateToolCallingAccuracy(expected, actual); + expect(result).toBe(1); + }); + + it("should return 1 for array where additional elements are allowed but not provided", () => { + const expected: ExpectedToolCall[] = [ + { + toolName: "aggregate", + parameters: { + database: "mflix", + collection: "movies", + pipeline: [ + { $match: { genre: "Horror" } }, + Matcher.composite(Matcher.undefined, Matcher.anyValue), + ], + }, + }, + ]; + + const actual: LLMToolCall[] = [ + { + toolCallId: "1", + toolName: "aggregate", + parameters: { + database: "mflix", + collection: "movies", + pipeline: [{ $match: { genre: "Horror" } }], + }, + }, + ]; + + const result = calculateToolCallingAccuracy(expected, actual); + expect(result).toBe(1); }); }); diff --git a/tests/unit/parameterScorer.test.ts 
b/tests/unit/parameterScorer.test.ts deleted file mode 100644 index 57426d86..00000000 --- a/tests/unit/parameterScorer.test.ts +++ /dev/null @@ -1,87 +0,0 @@ -import { describe, expect, it } from "vitest"; -import { DifferenceCreate } from "microdiff"; -import { ParameterScorers, withParameterScorer, PARAMETER_SCORER_SYMBOL } from "../accuracy/sdk/parameterScorer.js"; - -describe("ParameterScorers", () => { - describe("noAdditionsAllowedForPaths", () => { - const scorer = ParameterScorers.noAdditionsAllowedForPaths(["filter", "query"]); - - it("should return 0.75 when no additions are made", () => { - const additions: DifferenceCreate[] = []; - expect(scorer(additions)).toBe(0.75); - }); - - it("should return 0.75 when additions are made to non-protected paths", () => { - const additions: DifferenceCreate[] = [ - { type: "CREATE", path: ["limit"], value: 10 }, - { type: "CREATE", path: ["sort"], value: { name: 1 } }, - ]; - expect(scorer(additions)).toBe(0.75); - }); - - it("should return 0 when additions are made to protected top-level paths", () => { - const additions: DifferenceCreate[] = [{ type: "CREATE", path: ["filter"], value: { name: "test" } }]; - expect(scorer(additions)).toBe(0); - }); - - it("should return 0 when additions are made to deeply nested protected paths", () => { - const additions: DifferenceCreate[] = [{ type: "CREATE", path: ["filter", "age", "$gte"], value: 18 }]; - expect(scorer(additions)).toBe(0); - }); - - it("should handle array indices in paths correctly", () => { - const additions: DifferenceCreate[] = [{ type: "CREATE", path: ["filter", "tags", 0], value: "new-tag" }]; - expect(scorer(additions)).toBe(0); - }); - }); - - describe("emptyAdditionsAllowedForPaths", () => { - const scorer = ParameterScorers.emptyAdditionsAllowedForPaths(["filter", "options"]); - - it("should return 0.75 when no additions are made", () => { - const additions: DifferenceCreate[] = []; - expect(scorer(additions)).toBe(0.75); - }); - - it("should return 
0.75 when empty object is added to protected path", () => { - const additions: DifferenceCreate[] = [{ type: "CREATE", path: ["filter"], value: {} }]; - expect(scorer(additions)).toBe(0.75); - }); - - it("should return 0.75 when null is added to protected path", () => { - const additions: DifferenceCreate[] = [{ type: "CREATE", path: ["filter"], value: null }]; - expect(scorer(additions)).toBe(0.75); - }); - - it("should return 0.75 when additions are made to non-protected paths", () => { - const additions: DifferenceCreate[] = [ - { type: "CREATE", path: ["limit"], value: 10 }, - { type: "CREATE", path: ["sort"], value: { name: 1 } }, - ]; - expect(scorer(additions)).toBe(0.75); - }); - - it("should return 0 when non-empty object is added to protected path", () => { - const additions: DifferenceCreate[] = [{ type: "CREATE", path: ["filter"], value: { name: "test" } }]; - expect(scorer(additions)).toBe(0); - }); - - it("should return 0 when non-empty additions are made to nested protected paths", () => { - const additions: DifferenceCreate[] = [{ type: "CREATE", path: ["filter", "name"], value: "test" }]; - expect(scorer(additions)).toBe(0); - }); - }); -}); - -describe("withParameterScorer", () => { - it("should attach scorer to parameters object", () => { - const params = { database: "test", collection: "users" }; - const scorer = ParameterScorers.noAdditionsAllowedForPaths(["filter"]); - - const result = withParameterScorer(params, scorer); - - expect(result.database).toBe("test"); - expect(result.collection).toBe("users"); - expect(result[PARAMETER_SCORER_SYMBOL]).toBe(scorer); - }); -}); From 49bfac45402dd8b95f2cd5bc0c18fbcb1d1c645e Mon Sep 17 00:00:00 2001 From: Nikola Irinchev Date: Fri, 18 Jul 2025 13:29:09 +0200 Subject: [PATCH 88/91] remove microdiff --- package-lock.json | 8 -------- package.json | 1 - 2 files changed, 9 deletions(-) diff --git a/package-lock.json b/package-lock.json index 0481dfa0..7b092c89 100644 --- a/package-lock.json +++ 
b/package-lock.json @@ -47,7 +47,6 @@ "eslint-config-prettier": "^10.1.5", "eslint-plugin-prettier": "^5.5.1", "globals": "^16.3.0", - "microdiff": "^1.5.0", "mongodb-runner": "^5.9.2", "ollama-ai-provider": "^1.2.0", "openapi-types": "^12.1.3", @@ -9616,13 +9615,6 @@ "node": ">= 0.6" } }, - "node_modules/microdiff": { - "version": "1.5.0", - "resolved": "https://registry.npmjs.org/microdiff/-/microdiff-1.5.0.tgz", - "integrity": "sha512-Drq+/THMvDdzRYrK0oxJmOKiC24ayUV8ahrt8l3oRK51PWt6gdtrIGrlIH3pT/lFh1z93FbAcidtsHcWbnRz8Q==", - "dev": true, - "license": "MIT" - }, "node_modules/micromatch": { "version": "4.0.8", "resolved": "https://registry.npmjs.org/micromatch/-/micromatch-4.0.8.tgz", diff --git a/package.json b/package.json index d2de4e3e..91acf2b0 100644 --- a/package.json +++ b/package.json @@ -51,7 +51,6 @@ "eslint-config-prettier": "^10.1.5", "eslint-plugin-prettier": "^5.5.1", "globals": "^16.3.0", - "microdiff": "^1.5.0", "mongodb-runner": "^5.9.2", "ollama-ai-provider": "^1.2.0", "openapi-types": "^12.1.3", From 356512b4b928bb6d1a80b975ccf95b005392f243 Mon Sep 17 00:00:00 2001 From: Nikola Irinchev Date: Fri, 18 Jul 2025 15:11:48 +0200 Subject: [PATCH 89/91] fix tests --- tests/accuracy/aggregate.test.ts | 4 ++-- tests/accuracy/createIndex.test.ts | 4 ++-- tests/accuracy/find.test.ts | 4 ++-- tests/accuracy/insertMany.test.ts | 9 +++++---- tests/accuracy/logs.test.ts | 3 ++- tests/accuracy/sdk/accuracyScorer.ts | 2 +- tests/accuracy/sdk/matcher.ts | 14 ++++++++------ tests/accuracy/updateMany.test.ts | 4 ++-- tests/integration/tools/mongodb/read/find.test.ts | 2 +- tests/unit/accuracyScorer.test.ts | 10 ++-------- 10 files changed, 27 insertions(+), 29 deletions(-) diff --git a/tests/accuracy/aggregate.test.ts b/tests/accuracy/aggregate.test.ts index f42be7ed..08b1ca61 100644 --- a/tests/accuracy/aggregate.test.ts +++ b/tests/accuracy/aggregate.test.ts @@ -13,9 +13,9 @@ describeAccuracyTests([ pipeline: [ { $group: { _id: "$release_year", count: { $sum: 
1 } } }, // For the sake of accuracy, we allow any sort order - Matcher.composite( + Matcher.anyOf( Matcher.undefined, - Matcher.unknown({ + Matcher.value({ $sort: Matcher.anyValue, }) ), diff --git a/tests/accuracy/createIndex.test.ts b/tests/accuracy/createIndex.test.ts index e162c6fa..08326ce3 100644 --- a/tests/accuracy/createIndex.test.ts +++ b/tests/accuracy/createIndex.test.ts @@ -10,7 +10,7 @@ describeAccuracyTests([ parameters: { database: "mflix", collection: "movies", - name: Matcher.composite(Matcher.undefined, Matcher.string()), + name: Matcher.anyOf(Matcher.undefined, Matcher.string()), keys: { release_year: 1, }, @@ -26,7 +26,7 @@ describeAccuracyTests([ parameters: { database: "mflix", collection: "movies", - name: Matcher.composite(Matcher.undefined, Matcher.string()), + name: Matcher.anyOf(Matcher.undefined, Matcher.string()), keys: { title: "text", }, diff --git a/tests/accuracy/find.test.ts b/tests/accuracy/find.test.ts index 1f8b4bc5..f291c46b 100644 --- a/tests/accuracy/find.test.ts +++ b/tests/accuracy/find.test.ts @@ -68,7 +68,7 @@ describeAccuracyTests([ collection: "movies", projection: { title: 1, - _id: Matcher.composite( + _id: Matcher.anyOf( Matcher.undefined, Matcher.number((value) => value === 0) ), @@ -89,7 +89,7 @@ describeAccuracyTests([ filter: { title: "Certain Fish" }, projection: { cast: 1, - _id: Matcher.composite(Matcher.undefined, Matcher.number()), + _id: Matcher.anyOf(Matcher.undefined, Matcher.number()), }, limit: Matcher.number((value) => value > 0), }, diff --git a/tests/accuracy/insertMany.test.ts b/tests/accuracy/insertMany.test.ts index 8c777a53..159072bb 100644 --- a/tests/accuracy/insertMany.test.ts +++ b/tests/accuracy/insertMany.test.ts @@ -1,4 +1,5 @@ import { describeAccuracyTests } from "./sdk/describeAccuracyTests.js"; +import { Matcher } from "./sdk/matcher.js"; describeAccuracyTests([ { @@ -16,15 +17,15 @@ describeAccuracyTests([ documents: [ { id: 1, - title: "name1", + name: "name1", }, { id: 2, - title: 
"name2", + name: "name2", }, { id: 3, - title: "name3", + name: "name3", }, ], }, @@ -39,7 +40,7 @@ describeAccuracyTests([ parameters: { database: "mflix", collection: "movies", - documents: [{}, {}, {}], + documents: [{ _id: Matcher.anyValue }, { _id: Matcher.anyValue }, { _id: Matcher.anyValue }], }, }, ], diff --git a/tests/accuracy/logs.test.ts b/tests/accuracy/logs.test.ts index 907ea4d2..83c9179b 100644 --- a/tests/accuracy/logs.test.ts +++ b/tests/accuracy/logs.test.ts @@ -1,4 +1,5 @@ import { describeAccuracyTests } from "./sdk/describeAccuracyTests.js"; +import { Matcher } from "./sdk/matcher.js"; describeAccuracyTests([ { @@ -18,7 +19,7 @@ describeAccuracyTests([ { toolName: "mongodb-logs", parameters: { - type: "global", + type: Matcher.anyOf(Matcher.undefined, Matcher.value("global")), limit: 10, }, }, diff --git a/tests/accuracy/sdk/accuracyScorer.ts b/tests/accuracy/sdk/accuracyScorer.ts index 83dc6a16..92c18853 100644 --- a/tests/accuracy/sdk/accuracyScorer.ts +++ b/tests/accuracy/sdk/accuracyScorer.ts @@ -75,7 +75,7 @@ export function calculateToolCallingAccuracy( .map(({ call, index }) => ({ call, index, - score: Matcher.unknown(expectedCall.parameters).match(call.parameters), + score: Matcher.value(expectedCall.parameters).match(call.parameters), })) .filter(({ score }) => score >= 0.75) .sort((a, b) => b.score - a.score || a.index - b.index); diff --git a/tests/accuracy/sdk/matcher.ts b/tests/accuracy/sdk/matcher.ts index 24bf50dc..06999a02 100644 --- a/tests/accuracy/sdk/matcher.ts +++ b/tests/accuracy/sdk/matcher.ts @@ -16,7 +16,7 @@ export abstract class Matcher { return new NumberMatcher(additionalFilter); } - public static composite(...matchers: Matcher[]): Matcher { + public static anyOf(...matchers: Matcher[]): Matcher { return new CompositeMatcher(matchers); } @@ -32,12 +32,12 @@ export abstract class Matcher { return new StringMatcher(); } - public static unknown(expected: unknown): Matcher { + public static value(expected: unknown): 
Matcher { if (typeof expected === "object" && expected !== null && MATCHER_SYMBOL in expected) { return expected as Matcher; } - return new UnknownMatcher(expected); + return new ValueMatcher(expected); } } @@ -114,7 +114,7 @@ class StringMatcher extends Matcher { } } -class UnknownMatcher extends Matcher { +class ValueMatcher extends Matcher { constructor(private expected: unknown) { super(); } @@ -145,7 +145,7 @@ class UnknownMatcher extends Matcher { } for (let i = 0; i < this.expected.length; i++) { - currentScore = Math.min(currentScore, Matcher.unknown(this.expected[i]).match(actual[i])); + currentScore = Math.min(currentScore, Matcher.value(this.expected[i]).match(actual[i])); if (currentScore === 0) { // If we already found a mismatch, we can stop early return 0; @@ -174,7 +174,7 @@ class UnknownMatcher extends Matcher { for (const key of expectedKeys) { currentScore = Math.min( currentScore, - Matcher.unknown((this.expected as Record)[key]).match( + Matcher.value((this.expected as Record)[key]).match( (actual as Record)[key] ) ); @@ -184,6 +184,8 @@ class UnknownMatcher extends Matcher { return 0; } } + } else { + return 0; } return currentScore; diff --git a/tests/accuracy/updateMany.test.ts b/tests/accuracy/updateMany.test.ts index 66698286..12b36f89 100644 --- a/tests/accuracy/updateMany.test.ts +++ b/tests/accuracy/updateMany.test.ts @@ -15,7 +15,7 @@ describeAccuracyTests([ new_field: 1, }, }, - upsert: Matcher.composite(Matcher.undefined, Matcher.boolean()), + upsert: Matcher.anyOf(Matcher.undefined, Matcher.boolean()), }, }, ], @@ -34,7 +34,7 @@ describeAccuracyTests([ new_field: 1, }, }, - upsert: Matcher.composite(Matcher.undefined, Matcher.boolean()), + upsert: Matcher.anyOf(Matcher.undefined, Matcher.boolean()), }, }, ], diff --git a/tests/integration/tools/mongodb/read/find.test.ts b/tests/integration/tools/mongodb/read/find.test.ts index 5aa378c8..fef79793 100644 --- a/tests/integration/tools/mongodb/read/find.test.ts +++ 
b/tests/integration/tools/mongodb/read/find.test.ts @@ -34,7 +34,7 @@ describeWithMongoDB("find tool", (integration) => { { name: "sort", description: - "A document, describing the sort order, matching the syntax of the sort argument of cursor.sort()", + "A document, describing the sort order, matching the syntax of the sort argument of cursor.sort(). The keys of the object are the fields to sort on, while the values are the sort directions (1 for ascending, -1 for descending).", type: "object", required: false, }, diff --git a/tests/unit/accuracyScorer.test.ts b/tests/unit/accuracyScorer.test.ts index 61566c55..231251b7 100644 --- a/tests/unit/accuracyScorer.test.ts +++ b/tests/unit/accuracyScorer.test.ts @@ -230,10 +230,7 @@ describe("calculateToolCallingAccuracy", () => { parameters: { database: "mflix", collection: "movies", - pipeline: [ - { $match: { genre: "Horror" } }, - Matcher.composite(Matcher.undefined, Matcher.anyValue), - ], + pipeline: [{ $match: { genre: "Horror" } }, Matcher.anyOf(Matcher.undefined, Matcher.anyValue)], }, }, ]; @@ -261,10 +258,7 @@ describe("calculateToolCallingAccuracy", () => { parameters: { database: "mflix", collection: "movies", - pipeline: [ - { $match: { genre: "Horror" } }, - Matcher.composite(Matcher.undefined, Matcher.anyValue), - ], + pipeline: [{ $match: { genre: "Horror" } }, Matcher.anyOf(Matcher.undefined, Matcher.anyValue)], }, }, ]; From 8a5a9d2c49777a8495c97d10d5e4ddd67c382f30 Mon Sep 17 00:00:00 2001 From: Nikola Irinchev Date: Fri, 18 Jul 2025 15:16:23 +0200 Subject: [PATCH 90/91] don't omit fields for MongoDB storage --- tests/accuracy/sdk/accuracyResultStorage/mongodbStorage.ts | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/accuracy/sdk/accuracyResultStorage/mongodbStorage.ts b/tests/accuracy/sdk/accuracyResultStorage/mongodbStorage.ts index 463a54f6..be11aeb3 100644 --- a/tests/accuracy/sdk/accuracyResultStorage/mongodbStorage.ts +++ 
b/tests/accuracy/sdk/accuracyResultStorage/mongodbStorage.ts @@ -8,8 +8,10 @@ import { ModelResponse, } from "./resultStorage.js"; -// Omitting these as they might contain large chunk of texts -const OMITTED_MODEL_RESPONSE_FIELDS: (keyof ModelResponse)[] = ["messages", "text"]; +// We could decide to omit some fields from the model response to reduce the size of the stored results. Since +// so far, the responses are not too big, we do not omit any fields, but if we decide to do so in the future, +// we could add `"messages"` and `"text"` to this list. +const OMITTED_MODEL_RESPONSE_FIELDS: (keyof ModelResponse)[] = []; export class MongoDBBasedResultStorage implements AccuracyResultStorage { private client: MongoClient; From 2d4e7505e5cb29b7293d0bbd4eaefd668ad35a68 Mon Sep 17 00:00:00 2001 From: Nikola Irinchev Date: Fri, 18 Jul 2025 15:56:55 +0200 Subject: [PATCH 91/91] fix test coverage --- vitest.config.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vitest.config.ts b/vitest.config.ts index 2a42cecb..239650ac 100644 --- a/vitest.config.ts +++ b/vitest.config.ts @@ -17,7 +17,7 @@ export default defineConfig({ hookTimeout: 3600000, setupFiles: ["./tests/setup.ts"], coverage: { - exclude: ["node_modules", "tests", "dist"], + exclude: ["node_modules", "tests", "dist", "vitest.config.ts", "scripts"], reporter: ["lcov"], }, projects: [

Expected Tool Calls LLM Tool Calls AccuracyBaseline Accuracy LLM Response Time (ms) Total Tokens Used
${formatBaselineAccuracy(snapshotEntry)} ${snapshotEntry.llmResponseTime.toFixed(2)} ${formatTokenUsage(snapshotEntry.tokensUsage || {})}
+

🤖 LLM Response

@@ -97,9 +187,7 @@ async function generateHtmlReport(snapshotEntries: AccuracySnapshotEntry[], accu ) .join(""); - // Read template file const template = await readFile(HTML_TESTS_SUMMARY_TEMPLATE, "utf8"); - // Fill template return populateTemplate(template, { accuracyRunId, runStatus, @@ -110,6 +198,12 @@ async function generateHtmlReport(snapshotEntries: AccuracySnapshotEntry[], accu totalTests: String(totalPrompts), modelsCount: String(modelsCount), testsWithZeroAccuracy: String(testsWithZeroAccuracy.length), + averageAccuracy: formatAccuracy(averageAccuracy), + baselineCommitSHA: baselineInfo?.commitSHA || "N/A", + baselineAccuracyRunId: baselineInfo?.accuracyRunId || "N/A", + baselineCreatedOn: baselineInfo?.createdOn || "N/A", + evalsImproved: String(evalsImproved), + evalsRegressed: String(evalsRegressed), tableRows, }); } @@ -117,36 +211,74 @@ async function generateHtmlReport(snapshotEntries: AccuracySnapshotEntry[], accu async function generateTestSummary(): Promise { try { const accuracyRunId = process.env.MDB_ACCURACY_RUN_ID; + const baselineCommitSHA = process.env.MDB_ACCURACY_BASELINE_COMMIT; + if (!accuracyRunId) { throw new Error("Cannot generate test summary, accuracy run id is unknown"); } console.log(`\n📊 Generating test summary for accuracy run: ${accuracyRunId}\n`); const storage = await getAccuracySnapshotStorage(); - const snapshot = await storage.getSnapshotForAccuracyRun(accuracyRunId); - await storage.close(); + const currentSnapshot = await storage.getSnapshotForAccuracyRun(accuracyRunId); - if (snapshot.length === 0) { - console.log("No snapshots found for the current run."); + if (currentSnapshot.length === 0) { + console.log("No snapshot entries found for the current run."); + await storage.close(); return; } - const htmlReport = await generateHtmlReport(snapshot, accuracyRunId); + let snapshotWithBaseline: SnapshotEntryWithBaseline[] = currentSnapshot; + let baselineInfo: { commitSHA: string; accuracyRunId: string; createdOn: string 
} | undefined; + + if (baselineCommitSHA) { + console.log(`🔍 Fetching baseline snapshot entries for commit: ${baselineCommitSHA}`); + const baselineSnapshot = await storage.getLatestSnapshotForCommit(baselineCommitSHA); + + if (baselineSnapshot.length > 0) { + console.log(`✅ Found ${baselineSnapshot.length} baseline snapshot entries.`); + snapshotWithBaseline = compareSnapshotEntries(currentSnapshot, baselineSnapshot); + + const firstBaselineSnapshot = baselineSnapshot[0]; + if (firstBaselineSnapshot) { + baselineInfo = { + commitSHA: firstBaselineSnapshot.commitSHA, + accuracyRunId: firstBaselineSnapshot.accuracyRunId, + createdOn: firstBaselineSnapshot.createdOn + ? new Date(firstBaselineSnapshot.createdOn).toLocaleString() + : "unknown", + }; + } + } else { + console.log(`⚠️ No baseline snapshots found for commit: ${baselineCommitSHA}`); + } + } + + const htmlReport = await generateHtmlReport(snapshotWithBaseline, accuracyRunId, baselineInfo); + await storage.close(); const reportPath = HTML_TESTS_SUMMARY_FILE; await writeFile(reportPath, htmlReport, "utf8"); console.log(`✅ HTML report generated: ${reportPath}`); - const totalPrompts = snapshot.length; - const modelsCount = new Set(snapshot.map((s) => `${s.provider} ${s.requestedModel}`)).size; - const testsWithZeroAccuracy = snapshot.filter((snapshotEntry) => snapshotEntry.toolCallingAccuracy === 0); + const totalPrompts = snapshotWithBaseline.length; + const modelsCount = new Set(snapshotWithBaseline.map((s) => `${s.provider} ${s.requestedModel}`)).size; + const testsWithZeroAccuracy = snapshotWithBaseline.filter( + (snapshotEntry) => snapshotEntry.toolCallingAccuracy === 0 + ); + const evalsImproved = snapshotWithBaseline.filter((s) => s.baseline?.comparisonResult === "improved").length; + const evalsRegressed = snapshotWithBaseline.filter((s) => s.baseline?.comparisonResult === "regressed").length; console.log(`\n📈 Summary:`); console.log(` Total prompts evaluated: ${totalPrompts}`); console.log(` Models 
tested: ${modelsCount}`); console.log(` Evals with 0% accuracy: ${testsWithZeroAccuracy.length}`); - console.log(` Report saved to: ${reportPath}\n`); + + if (baselineCommitSHA) { + console.log(` Baseline commit: ${baselineCommitSHA}`); + console.log(` Evals improved vs baseline: ${evalsImproved}`); + console.log(` Evals regressed vs baseline: ${evalsRegressed}`); + } } catch (error) { console.error("Error generating test summary:", error); process.exit(1); diff --git a/tests/accuracy/sdk/accuracy-snapshot-storage/mdb-snapshot-storage.ts b/tests/accuracy/sdk/accuracy-snapshot-storage/mdb-snapshot-storage.ts index d3b1b56a..960daffc 100644 --- a/tests/accuracy/sdk/accuracy-snapshot-storage/mdb-snapshot-storage.ts +++ b/tests/accuracy/sdk/accuracy-snapshot-storage/mdb-snapshot-storage.ts @@ -61,7 +61,7 @@ export class MongoDBSnapshotStorage implements AccuracySnapshotStorage { private async getLatestAccuracyRunForCommit(commit: string): Promise { const document = await this.snapshotCollection.findOne( - { commit: commit, accuracyRunStatus: AccuracyRunStatus.Done }, + { commitSHA: commit, accuracyRunStatus: AccuracyRunStatus.Done }, { sort: { createdOn: -1 }, projection: { accuracyRunId: 1 } } ); From bad3012f8125741b1237bdf5bebad22943945f57 Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Fri, 11 Jul 2025 15:05:59 +0200 Subject: [PATCH 44/91] Update .github/workflows/accuracy-tests.yml Co-authored-by: Nikola Irinchev --- .github/workflows/accuracy-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/accuracy-tests.yml b/.github/workflows/accuracy-tests.yml index 640fdd1a..f60f416f 100644 --- a/.github/workflows/accuracy-tests.yml +++ b/.github/workflows/accuracy-tests.yml @@ -16,7 +16,7 @@ jobs: github.event_name == 'workflow_dispatch' || (github.event_name == 'pull_request' && github.event.label.name == 'accuracy-tests') env: - MDB_OPEN_AI_API_KEY: ${{ secrets.MDB_OPEN_AI_API_KEY }} + MDB_OPEN_AI_API_KEY: ${{ 
secrets.ACCURACY_OPEN_AI_API_KEY }} MDB_GEMINI_API_KEY: ${{ secrets.MDB_GEMINI_API_KEY }} MDB_AZURE_OPEN_AI_API_KEY: ${{ secrets.MDB_AZURE_OPEN_AI_API_KEY }} MDB_AZURE_OPEN_AI_API_URL: ${{ secrets.MDB_AZURE_OPEN_AI_API_URL }} From bc6e7558b5a768a8a47b6cb645d87670ef529b23 Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Fri, 11 Jul 2025 15:06:10 +0200 Subject: [PATCH 45/91] Update .github/workflows/accuracy-tests.yml Co-authored-by: Nikola Irinchev --- .github/workflows/accuracy-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/accuracy-tests.yml b/.github/workflows/accuracy-tests.yml index f60f416f..b7b296f0 100644 --- a/.github/workflows/accuracy-tests.yml +++ b/.github/workflows/accuracy-tests.yml @@ -22,7 +22,7 @@ jobs: MDB_AZURE_OPEN_AI_API_URL: ${{ secrets.MDB_AZURE_OPEN_AI_API_URL }} MDB_ACCURACY_MDB_URL: ${{ secrets.MDB_ACCURACY_MDB_URL }} MDB_ACCURACY_MDB_DB: ${{ secrets.MDB_ACCURACY_MDB_DB }} - MDB_ACCURACY_MDB_COLLECTION: ${{ secrets.MDB_ACCURACY_MDB_COLLECTION }} + MDB_ACCURACY_MDB_COLLECTION: ${{ vars.ACCURACY_MDB_COLLECTION }} MDB_ACCURACY_BASELINE_COMMIT: ${{ github.event.pull_request.base.sha || '' }} steps: - uses: GitHubSecurityLab/actions-permissions/monitor@v1 From 3e094fabf4f8d3d4520c39fe0017be59eb8ece55 Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Fri, 11 Jul 2025 15:06:17 +0200 Subject: [PATCH 46/91] Update .github/workflows/accuracy-tests.yml Co-authored-by: Nikola Irinchev --- .github/workflows/accuracy-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/accuracy-tests.yml b/.github/workflows/accuracy-tests.yml index b7b296f0..6f99eab7 100644 --- a/.github/workflows/accuracy-tests.yml +++ b/.github/workflows/accuracy-tests.yml @@ -21,7 +21,7 @@ jobs: MDB_AZURE_OPEN_AI_API_KEY: ${{ secrets.MDB_AZURE_OPEN_AI_API_KEY }} MDB_AZURE_OPEN_AI_API_URL: ${{ secrets.MDB_AZURE_OPEN_AI_API_URL }} MDB_ACCURACY_MDB_URL: ${{ secrets.MDB_ACCURACY_MDB_URL }} - 
MDB_ACCURACY_MDB_DB: ${{ secrets.MDB_ACCURACY_MDB_DB }} + MDB_ACCURACY_MDB_DB: ${{ vars.ACCURACY_MDB_DB }} MDB_ACCURACY_MDB_COLLECTION: ${{ vars.ACCURACY_MDB_COLLECTION }} MDB_ACCURACY_BASELINE_COMMIT: ${{ github.event.pull_request.base.sha || '' }} steps: From dca7217c333523ad78438ddd5637e7bd243bd30b Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Fri, 11 Jul 2025 15:06:35 +0200 Subject: [PATCH 47/91] Update .github/workflows/accuracy-tests.yml Co-authored-by: Nikola Irinchev --- .github/workflows/accuracy-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/accuracy-tests.yml b/.github/workflows/accuracy-tests.yml index 6f99eab7..75dac32c 100644 --- a/.github/workflows/accuracy-tests.yml +++ b/.github/workflows/accuracy-tests.yml @@ -20,7 +20,7 @@ jobs: MDB_GEMINI_API_KEY: ${{ secrets.MDB_GEMINI_API_KEY }} MDB_AZURE_OPEN_AI_API_KEY: ${{ secrets.MDB_AZURE_OPEN_AI_API_KEY }} MDB_AZURE_OPEN_AI_API_URL: ${{ secrets.MDB_AZURE_OPEN_AI_API_URL }} - MDB_ACCURACY_MDB_URL: ${{ secrets.MDB_ACCURACY_MDB_URL }} + MDB_ACCURACY_MDB_URL: ${{ secrets.ACCURACY_MDB_CONNECTION_STRING }} MDB_ACCURACY_MDB_DB: ${{ vars.ACCURACY_MDB_DB }} MDB_ACCURACY_MDB_COLLECTION: ${{ vars.ACCURACY_MDB_COLLECTION }} MDB_ACCURACY_BASELINE_COMMIT: ${{ github.event.pull_request.base.sha || '' }} From 05c81c0c59f79c65a27e5a6508b9795e2c2549b8 Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Fri, 11 Jul 2025 15:25:13 +0200 Subject: [PATCH 48/91] chore: secrets as per conventions --- .github/workflows/accuracy-tests.yml | 8 +++---- scripts/run-accuracy-tests.sh | 2 -- .../mdb-snapshot-storage.ts | 22 ++++--------------- 3 files changed, 7 insertions(+), 25 deletions(-) diff --git a/.github/workflows/accuracy-tests.yml b/.github/workflows/accuracy-tests.yml index 75dac32c..bd20a4c8 100644 --- a/.github/workflows/accuracy-tests.yml +++ b/.github/workflows/accuracy-tests.yml @@ -17,12 +17,10 @@ jobs: (github.event_name == 'pull_request' && github.event.label.name 
== 'accuracy-tests') env: MDB_OPEN_AI_API_KEY: ${{ secrets.ACCURACY_OPEN_AI_API_KEY }} - MDB_GEMINI_API_KEY: ${{ secrets.MDB_GEMINI_API_KEY }} - MDB_AZURE_OPEN_AI_API_KEY: ${{ secrets.MDB_AZURE_OPEN_AI_API_KEY }} - MDB_AZURE_OPEN_AI_API_URL: ${{ secrets.MDB_AZURE_OPEN_AI_API_URL }} + MDB_GEMINI_API_KEY: ${{ secrets.ACCURACY_GEMINI_API_KEY }} + MDB_AZURE_OPEN_AI_API_KEY: ${{ secrets.ACCURACY_AZURE_OPEN_AI_API_KEY }} + MDB_AZURE_OPEN_AI_API_URL: ${{ secrets.ACCURACY_AZURE_OPEN_AI_API_URL }} MDB_ACCURACY_MDB_URL: ${{ secrets.ACCURACY_MDB_CONNECTION_STRING }} - MDB_ACCURACY_MDB_DB: ${{ vars.ACCURACY_MDB_DB }} - MDB_ACCURACY_MDB_COLLECTION: ${{ vars.ACCURACY_MDB_COLLECTION }} MDB_ACCURACY_BASELINE_COMMIT: ${{ github.event.pull_request.base.sha || '' }} steps: - uses: GitHubSecurityLab/actions-permissions/monitor@v1 diff --git a/scripts/run-accuracy-tests.sh b/scripts/run-accuracy-tests.sh index ae02dd06..db69639e 100644 --- a/scripts/run-accuracy-tests.sh +++ b/scripts/run-accuracy-tests.sh @@ -10,8 +10,6 @@ export MDB_ACCURACY_RUN_ID=$(npx uuid v4) # For providing a mongodb based storage to store accuracy snapshots # export MDB_ACCURACY_MDB_URL="" -# export MDB_ACCURACY_MDB_DB="" -# export MDB_ACCURACY_MDB_COLLECTION="" # By default we run all the tests under tests/accuracy folder unless a path is # specified in the command line. 
Such as: diff --git a/tests/accuracy/sdk/accuracy-snapshot-storage/mdb-snapshot-storage.ts b/tests/accuracy/sdk/accuracy-snapshot-storage/mdb-snapshot-storage.ts index 960daffc..a3915fdc 100644 --- a/tests/accuracy/sdk/accuracy-snapshot-storage/mdb-snapshot-storage.ts +++ b/tests/accuracy/sdk/accuracy-snapshot-storage/mdb-snapshot-storage.ts @@ -10,17 +10,9 @@ import { export class MongoDBSnapshotStorage implements AccuracySnapshotStorage { private readonly client: MongoClient; private readonly snapshotCollection: Collection; - private constructor({ - mongodbUrl, - database, - collection, - }: { - mongodbUrl: string; - database: string; - collection: string; - }) { + private constructor(mongodbUrl: string) { this.client = new MongoClient(mongodbUrl); - this.snapshotCollection = this.client.db(database).collection(collection); + this.snapshotCollection = this.client.db("mongodb-mcp-server").collection("accuracy-tests"); } async createSnapshotEntry( @@ -81,16 +73,10 @@ export class MongoDBSnapshotStorage implements AccuracySnapshotStorage { static getStorage(): MongoDBSnapshotStorage | null { const mongodbUrl = process.env.MDB_ACCURACY_MDB_URL; - const database = process.env.MDB_ACCURACY_MDB_DB; - const collection = process.env.MDB_ACCURACY_MDB_COLLECTION; - if (!mongodbUrl || !database || !collection) { + if (!mongodbUrl) { return null; } - return new MongoDBSnapshotStorage({ - mongodbUrl, - database, - collection, - }); + return new MongoDBSnapshotStorage(mongodbUrl); } } From e47922f64bfe46f0c118b4ff0824f9075ab1e7cf Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Mon, 14 Jul 2025 01:26:09 +0200 Subject: [PATCH 49/91] chore: updated how we store accuracy result Instead of storing multiple documents per accuracy test run(one for each prompt+model response), we will now be storing one document for accuracy result and under that, all the prompt+model responses will be nested. 
--- package.json | 2 +- resources/test-summary-template.html | 38 +- scripts/{ => accuracy}/run-accuracy-tests.sh | 17 +- .../update-accuracy-run-status.ts | 11 +- scripts/generate-test-summary.ts | 388 +++++++++--------- tests/accuracy/create-collection.test.ts | 2 +- tests/accuracy/drop-collection.test.ts | 2 +- tests/accuracy/drop-database.test.ts | 2 +- tests/accuracy/logs.test.ts | 2 +- .../accuracy-result-storage/disk-storage.ts | 169 ++++++++ .../get-accuracy-result-storage.ts | 10 + .../mongodb-storage.ts | 103 +++++ .../accuracy-result-storage/result-storage.ts | 116 ++++++ tests/accuracy/sdk/accuracy-scorer.ts | 2 +- .../disk-snapshot-storage.ts | 117 ------ .../get-snapshot-storage.ts | 7 - .../mdb-snapshot-storage.ts | 82 ---- .../snapshot-storage.ts | 127 ------ tests/accuracy/sdk/accuracy-testing-client.ts | 2 +- tests/accuracy/sdk/constants.ts | 4 +- tests/accuracy/sdk/describe-accuracy-tests.ts | 22 +- tests/unit/accuracy-scorer.test.ts | 2 +- 22 files changed, 644 insertions(+), 583 deletions(-) rename scripts/{ => accuracy}/run-accuracy-tests.sh (66%) rename scripts/{ => accuracy}/update-accuracy-run-status.ts (51%) create mode 100644 tests/accuracy/sdk/accuracy-result-storage/disk-storage.ts create mode 100644 tests/accuracy/sdk/accuracy-result-storage/get-accuracy-result-storage.ts create mode 100644 tests/accuracy/sdk/accuracy-result-storage/mongodb-storage.ts create mode 100644 tests/accuracy/sdk/accuracy-result-storage/result-storage.ts delete mode 100644 tests/accuracy/sdk/accuracy-snapshot-storage/disk-snapshot-storage.ts delete mode 100644 tests/accuracy/sdk/accuracy-snapshot-storage/get-snapshot-storage.ts delete mode 100644 tests/accuracy/sdk/accuracy-snapshot-storage/mdb-snapshot-storage.ts delete mode 100644 tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.ts diff --git a/package.json b/package.json index 612671be..fdd48f9c 100644 --- a/package.json +++ b/package.json @@ -31,7 +31,7 @@ "generate": "./scripts/generate.sh", 
"test": "vitest --coverage", "pre:test:accuracy": "npm run build:compile", - "test:accuracy": "sh ./scripts/run-accuracy-tests.sh" + "test:accuracy": "sh ./scripts/accuracy/run-accuracy-tests.sh" }, "license": "Apache-2.0", "devDependencies": { diff --git a/resources/test-summary-template.html b/resources/test-summary-template.html index 903457f8..a5c14f33 100644 --- a/resources/test-summary-template.html +++ b/resources/test-summary-template.html @@ -167,28 +167,28 @@ font-family: "Monaco", "Menlo", monospace; font-size: 12px; max-height: 400px; + max-width: 1300px; overflow-y: auto; } - .accuracy-perfect { - background-color: #d4edda; - color: #155724; + .run-status { + text-transform: capitalize; + } + .chip { padding: 2px 6px; border-radius: 3px; font-weight: bold; } - .accuracy-good { + .perfect { + background-color: #d4edda; + color: #155724; + } + .good { background-color: #fff3cd; color: #856404; - padding: 2px 6px; - border-radius: 3px; - font-weight: bold; } - .accuracy-poor { + .poor { background-color: #f8d7da; color: #721c24; - padding: 2px 6px; - border-radius: 3px; - font-weight: bold; } .tool-call { background: #e9ecef; @@ -303,13 +303,17 @@

📊 MongoDB MCP Server - Accuracy Test Summary

📊 Current Run Information

+
+
Commit SHA
+
{{commitSHA}}
+
Accuracy Run ID
{{accuracyRunId}}
-
Commit SHA
-
{{commitSHA}}
+
Accuracy Run Status
+
{{accuracyRunStatus}}
Run Created On
@@ -347,13 +351,17 @@

📈 Test Results Summary

🔄 Baseline Comparison

+
+
Baseline Commit SHA
+
{{baselineCommitSHA}}
+
Baseline Accuracy Run ID
{{baselineAccuracyRunId}}
-
Baseline Commit SHA
-
{{baselineCommitSHA}}
+
Baseline Accuracy Run Status
+
{{baselineAccuracyRunStatus}}
Baseline Run Created On
diff --git a/scripts/run-accuracy-tests.sh b/scripts/accuracy/run-accuracy-tests.sh similarity index 66% rename from scripts/run-accuracy-tests.sh rename to scripts/accuracy/run-accuracy-tests.sh index db69639e..10ae6192 100644 --- a/scripts/run-accuracy-tests.sh +++ b/scripts/accuracy/run-accuracy-tests.sh @@ -8,7 +8,7 @@ export MDB_ACCURACY_RUN_ID=$(npx uuid v4) # export MDB_AZURE_OPEN_AI_API_KEY="" # export MDB_AZURE_OPEN_AI_API_URL="" -# For providing a mongodb based storage to store accuracy snapshots +# For providing a mongodb based storage to store accuracy result # export MDB_ACCURACY_MDB_URL="" # By default we run all the tests under tests/accuracy folder unless a path is @@ -16,31 +16,30 @@ export MDB_ACCURACY_RUN_ID=$(npx uuid v4) # npm run test:accuracy -- tests/accuracy/some-test.test.ts TEST_PATH_PATTERN="${1:-tests/accuracy}" shift || true +echo "Running accuracy tests with MDB_ACCURACY_RUN_ID '$MDB_ACCURACY_RUN_ID' and TEST_PATH_PATTERN '$TEST_PATH_PATTERN'" node --experimental-vm-modules node_modules/jest/bin/jest.js --bail --testPathPatterns "$TEST_PATH_PATTERN" "$@" # Preserving the exit code from test run to correctly notify in the CI # environments when the tests fail. JEST_EXIT_CODE=$? -# Each test run submits an accuracy snapshot entry with the accuracyRunStatus: +# Each test run submits an accuracy result with the accuracyRunStatus: # "in-progress". When all the tests are done and jest exits with an exit code of # 0, we can safely mark accuracy run as finished otherwise failed. # This "outside-the-tests-status-update" is arising out of the fact that each # test suite stores their own accuracy run data in the storage and this setup # might lead to data inconsistency when the tests fail. 
To overcome that each -# accuracy snapshot entry has a status which by default is "in-progress" and is +# accuracy result entry has a status which by default is "in-progress" and is # updated when the tests either pass (all our accuracy tests are supposed to # pass unless some errors occurs during the test runs), or fail. # This is necessary when comparing one accuracy run with another as we wouldn't # want to compare against an incomplete run. -if [ $JEST_EXIT_CODE -eq 0 ]; then - MDB_ACCURACY_RUN_STATUS="done" npx tsx scripts/update-accuracy-run-status.ts || echo "Warning: Failed to update accuracy run status to 'done'" - npx tsx scripts/generate-test-summary.ts || echo "Warning: Failed to generate test summary HTML report" -else - MDB_ACCURACY_RUN_STATUS="failed" npx tsx scripts/update-accuracy-run-status.ts || echo "Warning: Failed to update accuracy run status to 'failed'" -fi +export MDB_ACCURACY_RUN_STATUS=$([ $JEST_EXIT_CODE -eq 0 ] && echo "done" || echo "failed") +npx tsx scripts/accuracy/update-accuracy-run-status.ts || echo "Warning: Failed to update accuracy run status to '$MDB_ACCURACY_RUN_STATUS'" +# This is optional but we do it anyways to generate a readable summary of report. 
+npx tsx scripts/generate-test-summary.ts || echo "Warning: Failed to generate test summary HTML report" exit $JEST_EXIT_CODE \ No newline at end of file diff --git a/scripts/update-accuracy-run-status.ts b/scripts/accuracy/update-accuracy-run-status.ts similarity index 51% rename from scripts/update-accuracy-run-status.ts rename to scripts/accuracy/update-accuracy-run-status.ts index 6d8e3895..344ed86d 100644 --- a/scripts/update-accuracy-run-status.ts +++ b/scripts/accuracy/update-accuracy-run-status.ts @@ -1,18 +1,21 @@ -import { getAccuracySnapshotStorage } from "../tests/accuracy/sdk/accuracy-snapshot-storage/get-snapshot-storage.js"; -import { AccuracyRunStatus } from "../tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.js"; +import { getAccuracyResultStorage } from "../../tests/accuracy/sdk/accuracy-result-storage/get-accuracy-result-storage.js"; +import { AccuracyRunStatus } from "../../tests/accuracy/sdk/accuracy-result-storage/result-storage.js"; +import { getCommitSHA } from "../../tests/accuracy/sdk/git-info.js"; const envAccuracyRunId = process.env.MDB_ACCURACY_RUN_ID; const envAccuracyRunStatus = process.env.MDB_ACCURACY_RUN_STATUS; +const commitSHA = await getCommitSHA(); if ( !envAccuracyRunId || + !commitSHA || (envAccuracyRunStatus !== AccuracyRunStatus.Done && envAccuracyRunStatus !== AccuracyRunStatus.Failed) ) { process.exit(1); } console.time(`Marked accuracy run id - ${envAccuracyRunId} as ${envAccuracyRunStatus} in`); -const storage = await getAccuracySnapshotStorage(); -await storage.updateAccuracyRunStatus(envAccuracyRunId, envAccuracyRunStatus); +const storage = getAccuracyResultStorage(); +await storage.updateRunStatus(commitSHA, envAccuracyRunId, envAccuracyRunStatus); await storage.close(); console.timeEnd(`Marked accuracy run id - ${envAccuracyRunId} as ${envAccuracyRunStatus} in`); diff --git a/scripts/generate-test-summary.ts b/scripts/generate-test-summary.ts index fba40610..78eadd91 100644 --- 
a/scripts/generate-test-summary.ts +++ b/scripts/generate-test-summary.ts @@ -1,33 +1,55 @@ import { readFile, writeFile } from "fs/promises"; -import { getAccuracySnapshotStorage } from "../tests/accuracy/sdk/accuracy-snapshot-storage/get-snapshot-storage.js"; -import { HTML_TESTS_SUMMARY_FILE, HTML_TESTS_SUMMARY_TEMPLATE } from "../tests/accuracy/sdk/constants.js"; -import type { - AccuracySnapshotEntry, +import { getAccuracyResultStorage } from "../tests/accuracy/sdk/accuracy-result-storage/get-accuracy-result-storage.js"; +import { + AccuracyResult, + AccuracyRunStatuses, ExpectedToolCall, LLMToolCall, -} from "../tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.js"; + ModelResponse, +} from "../tests/accuracy/sdk/accuracy-result-storage/result-storage.js"; +import { getCommitSHA } from "../tests/accuracy/sdk/git-info.js"; +import { HTML_TESTS_SUMMARY_FILE, HTML_TESTS_SUMMARY_TEMPLATE } from "../tests/accuracy/sdk/constants.js"; + +type ComparableAccuracyResult = Omit & { + promptAndModelResponses: PromptAndModelResponse[]; +}; -interface BaselineComparison { - baselineAccuracy?: number; - comparisonResult?: "improved" | "regressed" | "same"; +interface PromptAndModelResponse extends ModelResponse { + prompt: string; + baselineToolAccuracy?: number; } -interface SnapshotEntryWithBaseline extends AccuracySnapshotEntry { - baseline?: BaselineComparison; +interface BaselineRunInfo { + commitSHA: string; + accuracyRunId: string; + accuracyRunStatus: AccuracyRunStatuses; + createdOn: string; } function populateTemplate(template: string, data: Record): string { return template.replace(/\{\{(\w+)\}\}/g, (_, key: string) => data[key] ?? 
""); } +function formatRunStatus(status: AccuracyRunStatuses) { + let statusClass = "chip run-status"; + if (status === "done") { + statusClass += " perfect"; + } else if (status === "in-progress") { + statusClass += " poor"; + } else if (status === "failed") { + statusClass += " poor"; + } + return `${status}`; +} + function formatAccuracy(accuracy: number): string { return (accuracy * 100).toFixed(1) + "%"; } function getAccuracyClass(accuracy: number): string { - if (accuracy === 1) return "accuracy-perfect"; - if (accuracy >= 0.75) return "accuracy-good"; - return "accuracy-poor"; + if (accuracy === 1) return "chip perfect"; + if (accuracy >= 0.75) return "chip good"; + return "chip poor"; } function formatToolCallsWithTooltip(toolCalls: ExpectedToolCall[] | LLMToolCall[]): string { @@ -44,9 +66,9 @@ function formatTokenUsage(tokensUsage: { completionTokens?: number; totalTokens?: number; }): string { - const total = tokensUsage.totalTokens || 0; - const prompt = tokensUsage.promptTokens || 0; - const completion = tokensUsage.completionTokens || 0; + const total = tokensUsage.totalTokens || "-"; + const prompt = tokensUsage.promptTokens || "-"; + const completion = tokensUsage.completionTokens || "-"; const tooltip = `Prompt: ${prompt}\nCompletion: ${completion}\nTotal: ${total}`; return `${total}`; @@ -56,232 +78,194 @@ function formatMessages(messages: Array>): string { return messages.map((msg) => JSON.stringify(msg, null, 2)).join("\n\n"); } -function formatBaselineAccuracy(snapshot: SnapshotEntryWithBaseline): string { - if (!snapshot.baseline || snapshot.baseline.baselineAccuracy === undefined) { - return 'N/A'; - } - - const baselineAccuracyText = formatAccuracy(snapshot.baseline.baselineAccuracy); - let comparisonClass = "accuracy-comparison"; +function formatCurrentAccuracy(response: PromptAndModelResponse): string { + const currentAccuracyText = formatAccuracy(response.toolCallingAccuracy); + const comparisonClass = 
getAccuracyClass(response.toolCallingAccuracy); let comparisonIcon = ""; - if (snapshot.baseline.comparisonResult) { - switch (snapshot.baseline.comparisonResult) { - case "improved": - comparisonClass += " accuracy-improved"; - comparisonIcon = " ↗"; - break; - case "regressed": - comparisonClass += " accuracy-regressed"; - comparisonIcon = " ↘"; - break; - case "same": - comparisonClass += " accuracy-same"; - comparisonIcon = " →"; - break; + if (typeof response.baselineToolAccuracy === "number") { + if (response.toolCallingAccuracy > response.baselineToolAccuracy) { + comparisonIcon = " ↗"; + } else if (response.toolCallingAccuracy < response.baselineToolAccuracy) { + comparisonIcon = " ↘"; + } else { + comparisonIcon = " →"; } } - return `${baselineAccuracyText}${comparisonIcon}`; + return `${currentAccuracyText}${comparisonIcon}`; } -function compareSnapshotEntries( - currentSnapshotEntries: AccuracySnapshotEntry[], - baselineSnapshotEntries: AccuracySnapshotEntry[] -): SnapshotEntryWithBaseline[] { - const baselineMap = new Map(); - baselineSnapshotEntries.forEach((entry) => { - const key = `${entry.provider}|${entry.requestedModel}|${entry.prompt}`; - baselineMap.set(key, entry); - }); - - return currentSnapshotEntries.map((entry) => { - const key = `${entry.provider}|${entry.requestedModel}|${entry.prompt}`; - const baselineEntry = baselineMap.get(key); - - if (!baselineEntry) { - return entry; - } - - let comparisonResult: "improved" | "regressed" | "same"; - if (entry.toolCallingAccuracy > baselineEntry.toolCallingAccuracy) { - comparisonResult = "improved"; - } else if (entry.toolCallingAccuracy < baselineEntry.toolCallingAccuracy) { - comparisonResult = "regressed"; - } else { - comparisonResult = "same"; - } +function formatBaselineAccuracy(response: PromptAndModelResponse): string { + if (response.baselineToolAccuracy === null || response.baselineToolAccuracy === undefined) { + return 'N/A'; + } + return 
`${formatAccuracy(response.baselineToolAccuracy)}`; +} - return { - ...entry, - baseline: { - baselineAccuracy: baselineEntry.toolCallingAccuracy, - comparisonResult, - }, - }; - }); +function getTestSummary(comparableResult: ComparableAccuracyResult) { + const responses = comparableResult.promptAndModelResponses; + return { + totalPrompts: new Set(responses.map((r) => r.prompt)).size, + totalModels: new Set(responses.map((r) => `${r.provider} ${r.requestedModel}`)).size, + testsWithZeroAccuracy: responses.filter((r) => r.toolCallingAccuracy === 0), + testsWith75Accuracy: responses.filter((r) => r.toolCallingAccuracy === 0.75), + testsWith100Accuracy: responses.filter((r) => r.toolCallingAccuracy === 100), + averageAccuracy: + responses.length > 0 ? responses.reduce((sum, r) => sum + r.toolCallingAccuracy, 0) / responses.length : 0, + evalsImproved: responses.filter( + (r) => typeof r.baselineToolAccuracy === "number" && r.toolCallingAccuracy > r.baselineToolAccuracy + ).length, + evalsRegressed: responses.filter( + (r) => typeof r.baselineToolAccuracy === "number" && r.toolCallingAccuracy < r.baselineToolAccuracy + ).length, + reportGeneratedOn: new Date().toLocaleString(), + resultCreatedOn: new Date(comparableResult.createdOn).toLocaleString(), + }; } async function generateHtmlReport( - snapshotEntries: SnapshotEntryWithBaseline[], - accuracyRunId: string, - baselineInfo?: { - commitSHA: string; - accuracyRunId: string; - createdOn: string; - } + comparableResult: ComparableAccuracyResult, + testSummary: ReturnType, + baselineInfo: BaselineRunInfo | null ): Promise { - const totalPrompts = snapshotEntries.length; - const modelsCount = new Set(snapshotEntries.map((s) => `${s.provider} ${s.requestedModel}`)).size; - const testsWithZeroAccuracy = snapshotEntries.filter((snapshotEntry) => snapshotEntry.toolCallingAccuracy === 0); - - const totalAccuracy = snapshotEntries.reduce((sum, entry) => sum + entry.toolCallingAccuracy, 0); - const averageAccuracy = 
totalPrompts > 0 ? totalAccuracy / totalPrompts : 0; - - const evalsImproved = snapshotEntries.filter((s) => s.baseline?.comparisonResult === "improved").length; - const evalsRegressed = snapshotEntries.filter((s) => s.baseline?.comparisonResult === "regressed").length; - - const firstSnapshotEntry = snapshotEntries[0]; - const runStatus = firstSnapshotEntry?.accuracyRunStatus || "unknown"; - const commitSHA = firstSnapshotEntry?.commitSHA || "unknown"; - const createdOn = firstSnapshotEntry?.createdOn - ? new Date(firstSnapshotEntry.createdOn).toLocaleString() - : "unknown"; - const reportGeneratedOn = new Date().toLocaleString(); - - const tableRows = snapshotEntries + const responses = comparableResult.promptAndModelResponses; + const tableRows = responses .map( - (snapshotEntry, index) => ` -
- - ${snapshotEntry.prompt} - ${snapshotEntry.provider} - ${snapshotEntry.requestedModel}${formatToolCallsWithTooltip(snapshotEntry.expectedToolCalls)}${formatToolCallsWithTooltip(snapshotEntry.actualToolCalls)} - - ${formatAccuracy(snapshotEntry.toolCallingAccuracy)} - - ${formatBaselineAccuracy(snapshotEntry)}${snapshotEntry.llmResponseTime.toFixed(2)}${formatTokenUsage(snapshotEntry.tokensUsage || {})}
-
-
-

🤖 LLM Response

-
${snapshotEntry.text}
-
-
-

💬 Conversation Messages

-
${formatMessages(snapshotEntry.messages)}
-
+ (response, index) => ` +
+ + ${response.prompt} + ${response.provider} - ${response.requestedModel}${formatToolCallsWithTooltip(response.expectedToolCalls)}${formatToolCallsWithTooltip(response.llmToolCalls)}${formatCurrentAccuracy(response)}${formatBaselineAccuracy(response)}${response.llmResponseTime.toFixed(2)}${formatTokenUsage(response.tokensUsed || {})}
+
+
+

🤖 LLM Response

+
${response.text || "N/A"}
+
+
+

💬 Conversation Messages

+
${formatMessages(response.messages || [])}
-