feat: add promptfoo eval harness for agent quality scoring (#371)
Adds a promptfoo eval harness for agent quality scoring: an LLM-as-judge system that scores task completion, instruction adherence, identity consistency, deliverable quality, and safety. Includes tests.
This commit is contained in:
24
evals/package.json
Normal file
24
evals/package.json
Normal file
@@ -0,0 +1,24 @@
{
  "name": "agency-agents-evals",
  "version": "0.1.0",
  "private": true,
  "description": "Evaluation harness for agency-agents specialist prompts",
  "scripts": {
    "eval": "promptfoo eval",
    "eval:view": "promptfoo view",
    "eval:cache-clear": "promptfoo cache clear",
    "extract": "ts-node scripts/extract-metrics.ts",
    "test": "vitest run",
    "test:watch": "vitest"
  },
  "dependencies": {
    "gray-matter": "^4.0.3",
    "promptfoo": "^0.121.3"
  },
  "devDependencies": {
    "@types/node": "^22.0.0",
    "ts-node": "^10.9.0",
    "typescript": "^5.7.0",
    "vitest": "^3.0.0"
  }
}
Reference in New Issue
Block a user