chore: make expectedToolCalls part of PromptResult

himanshusinghs · himanshusinghs · commit cb5178f1f350 · 2025-07-16T13:27:09.000+02:00
diff --git a/scripts/accuracy/generate-test-summary.ts b/scripts/accuracy/generate-test-summary.ts
@@ -21,6 +21,7 @@ type ComparableAccuracyResult = Omit<AccuracyResult, "promptResults"> & {
 
 interface PromptAndModelResponse extends ModelResponse {
     prompt: string;
+    expectedToolCalls: ExpectedToolCall[];
     baselineToolAccuracy?: number;
 }
 
@@ -293,6 +294,7 @@ async function generateTestSummary() {
                         return {
                             ...currentModelResponse,
                             prompt: currentPromptResult.prompt,
+                            expectedToolCalls: currentPromptResult.expectedToolCalls,
                             baselineToolAccuracy: baselineModelResponse?.toolCallingAccuracy,
                         };
                     });
diff --git a/tests/accuracy/sdk/accuracy-result-storage/disk-storage.ts b/tests/accuracy/sdk/accuracy-result-storage/disk-storage.ts
@@ -7,6 +7,7 @@ import {
     AccuracyResultStorage,
     AccuracyRunStatus,
     AccuracyRunStatuses,
+    ExpectedToolCall,
     ModelResponse,
 } from "./result-storage.js";
 
@@ -74,31 +75,36 @@ export class DiskBasedResultStorage implements AccuracyResultStorage {
         }
     }
 
-    async saveModelResponseForPrompt(
-        commitSHA: string,
-        runId: string,
-        prompt: string,
-        modelResponse: ModelResponse
-    ): Promise<void> {
+    async saveModelResponseForPrompt({
+        commitSHA,
+        runId,
+        prompt,
+        expectedToolCalls,
+        modelResponse,
+    }: {
+        commitSHA: string;
+        runId: string;
+        prompt: string;
+        expectedToolCalls: ExpectedToolCall[];
+        modelResponse: ModelResponse;
+    }): Promise<void> {
+        const initialData: AccuracyResult = {
+            runId,
+            runStatus: AccuracyRunStatus.InProgress,
+            createdOn: Date.now(),
+            commitSHA,
+            promptResults: [
+                {
+                    prompt,
+                    expectedToolCalls,
+                    modelResponses: [modelResponse],
+                },
+            ],
+        };
         const resultFilePath = this.getAccuracyResultFilePath(commitSHA, runId);
         const { fileCreatedWithInitialData } = await this.ensureAccuracyResultFile(
             resultFilePath,
-            JSON.stringify(
-                {
-                    runId,
-                    runStatus: AccuracyRunStatus.InProgress,
-                    createdOn: Date.now(),
-                    commitSHA,
-                    promptResults: [
-                        {
-                            prompt,
-                            modelResponses: [modelResponse],
-                        },
-                    ],
-                },
-                null,
-                2
-            )
+            JSON.stringify(initialData, null, 2)
         );
 
         if (fileCreatedWithInitialData) {
@@ -124,6 +130,7 @@ export class DiskBasedResultStorage implements AccuracyResultStorage {
                                 ...accuracyResult.promptResults,
                                 {
                                     prompt,
+                                    expectedToolCalls,
                                     modelResponses: [modelResponse],
                                 },
                             ],
@@ -136,6 +143,7 @@ export class DiskBasedResultStorage implements AccuracyResultStorage {
 
             accuracyResult.promptResults.splice(existingPromptIdx, 1, {
                 prompt: promptResult.prompt,
+                expectedToolCalls: promptResult.expectedToolCalls,
                 modelResponses: [...promptResult.modelResponses, modelResponse],
             });
 
diff --git a/tests/accuracy/sdk/accuracy-result-storage/mongodb-storage.ts b/tests/accuracy/sdk/accuracy-result-storage/mongodb-storage.ts
@@ -4,6 +4,7 @@ import {
     AccuracyResultStorage,
     AccuracyRunStatus,
     AccuracyRunStatuses,
+    ExpectedToolCall,
     ModelResponse,
 } from "./result-storage.js";
 
@@ -48,12 +49,19 @@ export class MongoDBBasedResultStorage implements AccuracyResultStorage {
         );
     }
 
-    async saveModelResponseForPrompt(
-        commitSHA: string,
-        runId: string,
-        prompt: string,
-        modelResponse: ModelResponse
-    ): Promise<void> {
+    async saveModelResponseForPrompt({
+        commitSHA,
+        runId,
+        prompt,
+        expectedToolCalls,
+        modelResponse,
+    }: {
+        commitSHA: string;
+        runId: string;
+        prompt: string;
+        expectedToolCalls: ExpectedToolCall[];
+        modelResponse: ModelResponse;
+    }): Promise<void> {
         const savedModelResponse: ModelResponse = { ...modelResponse };
         for (const field of this.omittedModelResponseFields) {
             delete savedModelResponse[field];
@@ -81,7 +89,7 @@ export class MongoDBBasedResultStorage implements AccuracyResultStorage {
             },
             {
                 $push: {
-                    promptResults: { prompt, modelResponses: [] },
+                    promptResults: { prompt, expectedToolCalls, modelResponses: [] },
                 },
             }
         );
diff --git a/tests/accuracy/sdk/accuracy-result-storage/result-storage.ts b/tests/accuracy/sdk/accuracy-result-storage/result-storage.ts
@@ -42,6 +42,10 @@ export interface PromptResult {
     /**
      * The actual prompt that was provided to LLM as test */
     prompt: string;
+    /**
+     * A list of tools, along with their parameters, that are expected to be
+     * called by the LLM in test. */
+    expectedToolCalls: ExpectedToolCall[];
     /**
      * The responses from the LLMs tested, when provided with the prompt. */
     modelResponses: ModelResponse[];
@@ -65,10 +69,6 @@ export interface ModelResponse {
      * were called by LLM when responding to the provided prompts. To know more
      * about how this number is generated, check - toolCallingAccuracy.ts */
     toolCallingAccuracy: number;
-    /**
-     * A list of tools, along with their parameters, that are expected to be
-     * called by the LLM in test. */
-    expectedToolCalls: ExpectedToolCall[];
     /**
      * A list of tools, along with their parameters, that were actually called
      * by the LLM in test. */
@@ -106,11 +106,12 @@ export interface AccuracyResultStorage {
     /**
      * Attempts to atomically insert the model response for the prompt in the
      * stored accuracy result. */
-    saveModelResponseForPrompt(
-        commitSHA: string,
-        runId: string,
-        prompt: string,
-        modelResponse: ModelResponse
-    ): Promise<void>;
+    saveModelResponseForPrompt(data: {
+        commitSHA: string;
+        runId: string;
+        prompt: string;
+        expectedToolCalls: ExpectedToolCall[];
+        modelResponse: ModelResponse;
+    }): Promise<void>;
     close(): Promise<void>;
 }
diff --git a/tests/accuracy/sdk/describe-accuracy-tests.ts b/tests/accuracy/sdk/describe-accuracy-tests.ts
@@ -102,17 +102,22 @@ export function describeAccuracyTests(models: TestableModels, accuracyTestConfig
             const toolCallingAccuracy = calculateToolCallingAccuracy(testConfig.expectedToolCalls, llmToolCalls);
 
             const responseTime = timeAfterPrompt - timeBeforePrompt;
-            await accuracyResultStorage.saveModelResponseForPrompt(commitSHA, accuracyRunId, testConfig.prompt, {
-                provider: model.provider,
-                requestedModel: model.modelName,
-                respondingModel: result.respondingModel,
-                llmResponseTime: responseTime,
-                toolCallingAccuracy: toolCallingAccuracy,
+            await accuracyResultStorage.saveModelResponseForPrompt({
+                commitSHA,
+                runId: accuracyRunId,
+                prompt: testConfig.prompt,
                 expectedToolCalls: testConfig.expectedToolCalls,
-                llmToolCalls: llmToolCalls,
-                tokensUsed: result.tokensUsage,
-                text: result.text,
-                messages: result.messages,
+                modelResponse: {
+                    provider: model.provider,
+                    requestedModel: model.modelName,
+                    respondingModel: result.respondingModel,
+                    llmResponseTime: responseTime,
+                    toolCallingAccuracy: toolCallingAccuracy,
+                    llmToolCalls: llmToolCalls,
+                    tokensUsed: result.tokensUsage,
+                    text: result.text,
+                    messages: result.messages,
+                },
             });
         });
     });