feat: add promptfoo eval harness for agent quality scoring (#371)

Adds a promptfoo eval harness for agent quality scoring: an LLM-as-judge system that scores task completion, instruction adherence, identity consistency, deliverable quality, and safety. Includes tests.
2026-04-10 21:54:31 -05:00
parent 1e73b5be0d
commit b456845e85
11 changed files with 796 additions and 0 deletions
@@ -0,0 +1,65 @@
+import fs from "fs";
+import os from "os";
+import path from "path";
+import { describe, it, expect } from "vitest";
+import { extractMetrics, parseAgentFile } from "./extract-metrics";
+
+describe("parseAgentFile", () => {
+  // Agent files are looked up two directories above this test file, one directory per category.
+  const agentFile = (category: string, fileName: string): string =>
+    path.resolve(__dirname, "../..", category, fileName);
+
+  const backendArchitect = agentFile(
+    "engineering",
+    "engineering-backend-architect.md"
+  );
+
+  it("extracts frontmatter fields from a real agent file", () => {
+    const result = parseAgentFile(backendArchitect);
+
+    expect(result.name).toBe("Backend Architect");
+    expect(result.description).toContain("backend architect");
+    expect(result.category).toBe("engineering");
+  });
+
+  it("extracts success metrics section", () => {
+    const result = parseAgentFile(backendArchitect);
+
+    // not.toBeNull() rather than toBeDefined(): a missing section is reported as null,
+    // and null would satisfy toBeDefined().
+    expect(result.successMetrics).not.toBeNull();
+    const metrics = result.successMetrics ?? [];
+    expect(metrics.length).toBeGreaterThan(0);
+    expect(metrics.some((m) => m.includes("200ms"))).toBe(true);
+  });
+
+  it("extracts critical rules section", () => {
+    const result = parseAgentFile(agentFile("academic", "academic-historian.md"));
+
+    expect(result.criticalRules).not.toBeNull();
+    expect((result.criticalRules ?? []).length).toBeGreaterThan(0);
+  });
+
+  it("always returns every AgentMetrics field", () => {
+    const result = parseAgentFile(backendArchitect);
+
+    expect(result).toHaveProperty("name");
+    expect(result).toHaveProperty("category");
+    expect(result).toHaveProperty("successMetrics");
+    expect(result).toHaveProperty("criticalRules");
+    expect(result).toHaveProperty("deliverableFormat");
+  });
+
+  it("handles agent with missing sections gracefully", () => {
+    // Use a throwaway file that has frontmatter but none of the extracted sections,
+    // so the "missing" code path is actually exercised.
+    const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), "agent-metrics-"));
+    const agentPath = path.join(tempDir, "minimal-agent.md");
+
+    try {
+      fs.writeFileSync(
+        agentPath,
+        [
+          "---",
+          "name: Minimal Agent",
+          "description: Has no optional sections",
+          "---",
+          "",
+          "# Minimal Agent",
+          "",
+          "Plain prose only.",
+          "",
+        ].join("\n")
+      );
+
+      const result = parseAgentFile(agentPath);
+
+      expect(result.name).toBe("Minimal Agent");
+      expect(result.description).toBe("Has no optional sections");
+      expect(result.category).toBe(path.basename(tempDir));
+      expect(result.successMetrics).toBeNull();
+      expect(result.criticalRules).toBeNull();
+      expect(result.deliverableFormat).toBeNull();
+    } finally {
+      fs.rmSync(tempDir, { recursive: true, force: true });
+    }
+  });
+});
+
+describe("extractMetrics", () => {
+  // glob patterns use forward slashes, so replace the platform separator
+  // that path.resolve() emits (a backslash on Windows).
+  const toGlobPattern = (...segments: string[]): string =>
+    path.resolve(__dirname, ...segments).split(path.sep).join("/");
+
+  it("extracts metrics for a single file path", () => {
+    const results = extractMetrics(
+      toGlobPattern("../../engineering/engineering-backend-architect.md")
+    );
+
+    expect(results.length).toBe(1);
+    expect(results[0]?.name).toBe("Backend Architect");
+  });
+
+  it("expands a glob pattern to the matching agent files", () => {
+    const results = extractMetrics(toGlobPattern("../../engineering/*.md"));
+
+    expect(results.length).toBeGreaterThan(0);
+    expect(results.every((agent) => agent.category === "engineering")).toBe(true);
+    expect(results.some((agent) => agent.name === "Backend Architect")).toBe(true);
+  });
+});
@@ -0,0 +1,127 @@
+import fs from "fs";
+import path from "path";
+import matter from "gray-matter";
+import { globSync } from "glob";
+
+/**
+ * Structured data extracted from one agent markdown file.
+ */
+export interface AgentMetrics {
+  /** Frontmatter `name`, or the file's basename without `.md` when the frontmatter has none. */
+  name: string;
+  /** Frontmatter `description`, or an empty string when the frontmatter has none. */
+  description: string;
+  /** Name of the directory that directly contains the file. */
+  category: string;
+  /** The path that was passed to the parser. */
+  filePath: string;
+  /** Bullet items found in the "Success Metrics" section; `null` when none were found. */
+  successMetrics: string[] | null;
+  /** Bullet items found in the "Critical Rules" section; `null` when none were found. */
+  criticalRules: string[] | null;
+  /** Raw text of the "Technical Deliverables" section; `null` when it is absent or empty. */
+  deliverableFormat: string | null;
+}
+
+/**
+ * Parse a single agent markdown file and extract structured metrics.
+ *
+ * The file is read synchronously as UTF-8 and split into frontmatter and body
+ * with gray-matter. The category is the name of the directory that contains
+ * the file.
+ *
+ * @param filePath - Path to the agent markdown file; echoed back unchanged in `filePath`.
+ * @returns The frontmatter fields plus the extracted sections; sections that
+ *   are missing are reported as `null`.
+ * @throws Propagates any error from `fs.readFileSync` (e.g. the file does not exist).
+ */
+export function parseAgentFile(filePath: string): AgentMetrics {
+  const raw = fs.readFileSync(filePath, "utf-8");
+  const { data: frontmatter, content } = matter(raw);
+
+  // Resolve first: for a bare relative name such as "agent.md", path.dirname()
+  // is "." and the category would otherwise come out as "." instead of the
+  // real containing directory.
+  const category = path.basename(path.dirname(path.resolve(filePath)));
+
+  return {
+    // `||` (not `??`) on purpose: an empty-string name also falls back to the file name.
+    name: frontmatter.name || path.basename(filePath, ".md"),
+    description: frontmatter.description || "",
+    category,
+    filePath,
+    successMetrics: extractSection(content, "Success Metrics"),
+    criticalRules: extractSection(content, "Critical Rules"),
+    deliverableFormat: extractRawSection(content, "Technical Deliverables"),
+  };
+}
+
+/**
+ * Extract bullet points from a markdown section by heading text.
+ *
+ * The section starts at a heading (levels 1-4) whose text, with emoji removed
+ * and compared case-insensitively, contains `sectionName`. It ends at the next
+ * heading of the same or a higher level. Deeper sub-headings stay inside the
+ * section, so bullets under them are included in the parent section's results.
+ *
+ * Lines inside fenced code blocks (``` or ~~~) are skipped entirely: a
+ * `# comment` in a code sample is not a heading, and list-like code lines are
+ * not bullets. Fences are tracked with a simple open/close toggle.
+ *
+ * @param content - Markdown body to scan.
+ * @param sectionName - Text the section heading must contain.
+ * @returns The bullet texts without their `-`/`*` marker, or `null` when no
+ *   bullet was found.
+ */
+function extractSection(content: string, sectionName: string): string[] | null {
+  const fencePattern = /^\s*(`{3,}|~{3,})/;
+  const headingPattern = /^(#{1,4})\s/;
+  // Pictographic emoji only. \p{Emoji} would also match ASCII digits, "#" and
+  // "*", silently deleting digits from heading text.
+  const emojiPattern = /[\p{Emoji_Presentation}\p{Extended_Pictographic}\uFE0F\u200D]/gu;
+
+  const wanted = sectionName.toLowerCase();
+  const bullets: string[] = [];
+  let inSection = false;
+  let sectionLevel = 0;
+  let inFence = false;
+
+  for (const line of content.split("\n")) {
+    if (fencePattern.test(line)) {
+      inFence = !inFence;
+      continue;
+    }
+    if (inFence) {
+      continue;
+    }
+
+    const headingMatch = line.match(headingPattern);
+    if (headingMatch) {
+      const level = (headingMatch[1] ?? "").length;
+
+      // A deeper heading inside the section is a sub-heading even if it repeats
+      // the section name; re-anchoring on it would cut the section short.
+      if (inSection && level > sectionLevel) {
+        continue;
+      }
+
+      const headingText = line
+        .replace(/^#{1,4}\s+/, "")
+        .replace(emojiPattern, "")
+        .trim()
+        .toLowerCase();
+
+      if (headingText.includes(wanted)) {
+        inSection = true;
+        sectionLevel = level;
+        continue;
+      }
+
+      // Same-or-higher-level heading that is not ours: the section is over.
+      if (inSection) {
+        break;
+      }
+      continue;
+    }
+
+    if (!inSection) {
+      continue;
+    }
+
+    const trimmed = line.trim();
+    if (/^[-*]\s/.test(trimmed)) {
+      const bullet = trimmed.replace(/^[-*]\s+/, "").trim();
+      if (bullet.length > 0) {
+        bullets.push(bullet);
+      }
+    }
+  }
+
+  return bullets.length > 0 ? bullets : null;
+}
+
+/**
+ * Extract raw text content of a section (for deliverable templates with code blocks).
+ *
+ * The section starts at a heading (levels 1-4) whose text, with emoji removed
+ * and compared case-insensitively, contains `sectionName`. It ends at the next
+ * heading of the same or a higher level. Deeper sub-heading lines are kept as
+ * part of the text.
+ *
+ * Headings are not recognised inside fenced code blocks (``` or ~~~), so a
+ * `# comment` in a code sample does not end the section; the fenced lines are
+ * kept verbatim. Fences are tracked with a simple open/close toggle.
+ *
+ * @param content - Markdown body to scan.
+ * @param sectionName - Text the section heading must contain.
+ * @returns The section body (heading line excluded) with surrounding
+ *   whitespace trimmed, or `null` when the section is missing or empty.
+ */
+function extractRawSection(content: string, sectionName: string): string | null {
+  const fencePattern = /^\s*(`{3,}|~{3,})/;
+  const headingPattern = /^(#{1,4})\s/;
+  // Pictographic emoji only. \p{Emoji} would also match ASCII digits, "#" and
+  // "*", silently deleting digits from heading text.
+  const emojiPattern = /[\p{Emoji_Presentation}\p{Extended_Pictographic}\uFE0F\u200D]/gu;
+
+  const wanted = sectionName.toLowerCase();
+  const sectionLines: string[] = [];
+  let inSection = false;
+  let sectionLevel = 0;
+  let inFence = false;
+
+  for (const line of content.split("\n")) {
+    if (fencePattern.test(line)) {
+      inFence = !inFence;
+    } else if (!inFence) {
+      const headingMatch = line.match(headingPattern);
+      const level = headingMatch ? (headingMatch[1] ?? "").length : 0;
+
+      // Only headings at the section's level or above can start or end it;
+      // deeper headings inside the section fall through and are kept as text.
+      if (headingMatch && !(inSection && level > sectionLevel)) {
+        const headingText = line
+          .replace(/^#{1,4}\s+/, "")
+          .replace(emojiPattern, "")
+          .trim()
+          .toLowerCase();
+
+        if (headingText.includes(wanted)) {
+          inSection = true;
+          sectionLevel = level;
+          continue;
+        }
+        if (inSection) {
+          break;
+        }
+      }
+    }
+
+    if (inSection) {
+      sectionLines.push(line);
+    }
+  }
+
+  const text = sectionLines.join("\n").trim();
+  return text.length > 0 ? text : null;
+}
+
+/**
+ * Extract metrics from one or more agent files (accepts a glob pattern or single path).
+ *
+ * @param pattern - Glob pattern or literal file path. On Windows,
+ *   backslash-separated paths (as produced by `path.resolve`) are accepted.
+ * @returns One entry per matched file, ordered by file path so repeated runs
+ *   produce identical output; empty when nothing matches.
+ * @throws Propagates any error raised while reading or parsing a matched file.
+ */
+export function extractMetrics(pattern: string): AgentMetrics[] {
+  const files = globSync(pattern, {
+    // glob treats "\" as an escape character by default, so Windows-style
+    // paths would match nothing. Only enable this on Windows so "\" escapes
+    // keep working elsewhere.
+    windowsPathsNoEscape: process.platform === "win32",
+  });
+
+  // glob does not promise any particular result order; sort for reproducibility.
+  return [...files].sort().map((file) => parseAgentFile(file));
+}
+
+// Script mode: runs only when this module is executed directly rather than
+// imported. JSON goes to stdout; the one-line summary goes to stderr so it
+// does not pollute piped output.
+if (require.main === module) {
+  const defaultPattern = path.resolve(__dirname, "../../*/*.md");
+  const agents = extractMetrics(process.argv[2] || defaultPattern);
+
+  console.log(JSON.stringify(agents, null, 2));
+  console.error(`Extracted metrics for ${agents.length} agents`);
+}