eval.js | dev-skills

raw
#!/usr/bin/env node
/**
 * =============================================================================
 * eval.js — Eval runner with built-in grading and report generation
 * =============================================================================
 *
 * Runs evals defined in *.eval.md files. Parses the test definition, runs
 * sessions (single-turn or multi-turn with a simulated user), grades
 * transcripts against acceptance criteria, and writes a report.
 *
 * USAGE
 *   node eval.js --eval path/to/file.eval.md --skill path/to/skill-dir
 *   node eval.js --eval path/to/file.eval.md --test "1.1" --skill path/to/skill
 *
 * FLAGS
 *   --eval PATH     Eval definition file (required)
 *   --test ID       Run only the test with this ID (default: all tests)
 *   --skill PATH    Skill directory (repeatable, must contain SKILL.md)
 *   --runs N        Parallel independent sessions per test (default: 3)
 *   --output PATH   JSON transcripts file (default: {name}.eval.{ts}.json)
 *   --help          Show this help message
 *
 * OUTPUT
 *   Report: {name}.eval.{timestamp}.md alongside the eval file.
 *   JSON:   {name}.eval.{timestamp}.json alongside the eval file (or --output).
 *   Logs:   Progress to stderr.
 *
 * =============================================================================
 */

import { spawn } from "node:child_process";
import {
  existsSync, mkdirSync, mkdtempSync, readFileSync, rmSync,
  symlinkSync, writeFileSync,
} from "node:fs";
import { randomUUID } from "node:crypto";
import { basename, dirname, join, resolve } from "node:path";
import { tmpdir } from "node:os";

// ── Regular expressions ─────────────────────────────────────────────────────

// Greedy match from the first "[" to the last "]" in a string, newlines
// included. Used to pull a JSON array out of an LLM reply that may surround
// it with other text (for example markdown fences).
const JSON_ARRAY_RE = /\[[\s\S]*\]/;
// Matches every ":" and "." so they can be replaced when an ISO timestamp is
// embedded in an output file name.
const TIMESTAMP_SEPARATORS_RE = /[:.]/g;

// Unset CLAUDECODE so child claude processes don't detect a nested session
// and refuse to launch.
delete process.env.CLAUDECODE;

// ── CLI argument parsing ────────────────────────────────────────────────────

const HELP = `\
Usage: node eval.js --eval path/to/file.eval.md [options]

Options:
  --eval PATH     Eval definition file (required)
  --test ID       Run only the test with this ID (default: all tests)
  --skill PATH    Skill directory (repeatable)
  --runs N        Parallel independent sessions per test (default: 3)
  --output PATH   JSON transcripts file (default: {name}.eval.{ts}.json)
  --help          Show this help message`;

/** Flags that must be followed by a value. */
const VALUE_FLAGS = new Set([
  "--eval", "--test", "--runs", "--skill", "--output",
]);

/**
 * Report a command-line usage problem on stderr, print the help text, and
 * terminate with exit status 1. Never returns.
 *
 * @param {string} message - Problem description, printed after "Error: ".
 */
function usageError(message) {
  console.error(`Error: ${message}\n`);
  console.error(HELP);
  process.exit(1);
}

const args = process.argv.slice(2);
let runs = 3;
const skillPaths = [];
let evalPath = "";
let testId = "";
let outputPath = "";

// Arguments are consumed as flag/value pairs. Mistakes are rejected up front
// (unknown flag, missing value, non-numeric --runs) because every eval run
// spends LLM calls; silently ignoring a typo such as "--skil" would run the
// whole eval with the wrong configuration.
for (let index = 0; index < args.length; index += 2) {
  const flag = args[index];
  if (flag === "--help") {
    console.log(HELP);
    process.exit(0);
  }
  if (!VALUE_FLAGS.has(flag)) {
    usageError(`unknown argument "${flag}".`);
  }

  const value = args[index + 1];
  // A following flag is not a value: "--eval --test 1.1" is a mistake.
  if (value === undefined || value === "--help" || VALUE_FLAGS.has(value)) {
    usageError(`${flag} requires a value.`);
  }

  switch (flag) {
    case "--eval":
      evalPath = value;
      break;
    case "--test":
      testId = value;
      break;
    case "--runs":
      runs = Number(value);
      // NaN, zero, negatives and fractions would produce zero or a
      // truncated number of sessions further down.
      if (!Number.isInteger(runs) || runs < 1) {
        usageError(`--runs must be a positive integer (got "${value}").`);
      }
      break;
    case "--skill":
      skillPaths.push(value);
      break;
    case "--output":
      outputPath = value;
      break;
    default:
      // Unreachable: every member of VALUE_FLAGS is handled above.
      break;
  }
}

if (!evalPath) {
  usageError("--eval is required.");
}

// Default outputPath is set after evalBasename is known (see below).

// ── Eval file extraction ────────────────────────────────────────────────────

/**
 * Coerce one LLM-extracted test object into the shape the runner relies on.
 *
 * The extraction model is asked for specific field types but nothing
 * enforces them, so every field the rest of the script reads is normalised:
 *   - id / name / description / openingPrompt become strings (a numeric id
 *     such as 1.1 would otherwise never equal the --test string);
 *   - userBriefing becomes "" unless it is a string;
 *   - maxTurns must be an integer >= 1, otherwise it falls back to the
 *     defaults stated in the extraction prompt: 1 for single-turn tests
 *     (empty briefing) and 6 for multi-turn tests;
 *   - criteria / groundTruth become arrays of strings ([] when absent).
 *
 * @param {object} raw - One element of the parsed JSON array.
 * @param {number} index - Position in the array; used for a fallback id.
 * @returns {object} A copy of `raw` with the fields above normalised.
 */
function normalizeTest(raw, index) {
  const toStringList = (value) =>
    (Array.isArray(value) ? value.map(String) : []);
  const userBriefing =
    typeof raw.userBriefing === "string" ? raw.userBriefing : "";
  const requestedTurns = Number(raw.maxTurns);
  const defaultTurns = userBriefing ? 6 : 1;

  return {
    ...raw,
    id: String(raw.id ?? index + 1),
    name: String(raw.name ?? ""),
    description: String(raw.description ?? ""),
    openingPrompt: String(raw.openingPrompt ?? ""),
    userBriefing,
    maxTurns:
      Number.isInteger(requestedTurns) && requestedTurns >= 1 ?
        requestedTurns :
        defaultTurns,
    criteria: toStringList(raw.criteria),
    groundTruth: toStringList(raw.groundTruth),
  };
}

/**
 * Send the eval file to an LLM and get back structured test definitions.
 *
 * No regex parsing of the eval format itself — the LLM reads whatever format
 * the author used and returns a JSON array of test objects. The array is
 * located in the reply with JSON_ARRAY_RE, parsed, and each entry is passed
 * through normalizeTest().
 *
 * Exits the process with status 1 (after printing the start of the
 * offending text) when the reply contains no array, the array is not valid
 * JSON, or an entry is not an object.
 *
 * @param {string} filePath - Path to the *.eval.md file.
 * @returns {Promise<object[]>} Normalised test definitions.
 */
async function extractTests(filePath) {
  const content = readFileSync(resolve(filePath), "utf-8");

  const prompt = [
    "Read this eval definition file and extract every test as JSON.",
    "",
    "Return a JSON array of test objects. Each test must have:",
    '  "id": string — test identifier (e.g. "1.1")',
    '  "name": string — short test name',
    '  "description": string — what the test evaluates',
    '  "openingPrompt": string — the first message sent to the agent',
    '  "userBriefing": string — simulated user briefing ("" if single-turn)',
    '  "maxTurns": number — turn limit (1 if single-turn, 6 default)',
    '  "criteria": string[] — acceptance criteria',
    '  "groundTruth": string[] — ground truth facts (may be empty)',
    "",
    "Output only valid JSON, nothing else.",
    "",
    "---",
    "",
    content,
  ].join("\n");

  const stdout = await exec("claude", [
    "-p",
    "--output-format", "json",
    "--max-turns", "1",
    prompt,
  ]);
  // String() guards the .match/.slice calls below against a non-string reply.
  const text = String(parseResult(stdout));

  const abort = (label, detail) => {
    console.error("Error: could not extract tests from eval file.");
    console.error(label, detail.slice(0, 500));
    process.exit(1);
  };

  const jsonMatch = text.match(JSON_ARRAY_RE);
  if (!jsonMatch) {
    abort("LLM response:", text);
  }

  let parsed;
  try {
    parsed = JSON.parse(jsonMatch[0]);
  } catch (err) {
    // The bracketed span is not valid JSON (e.g. prose containing brackets).
    abort(`Invalid JSON (${err.message}):`, jsonMatch[0]);
  }

  const isRecord = (entry) =>
    typeof entry === "object" && entry !== null && !Array.isArray(entry);
  if (!parsed.every(isRecord)) {
    abort("Expected an array of test objects, got:", jsonMatch[0]);
  }

  return parsed.map(normalizeTest);
}

// ── Isolated eval environment ───────────────────────────────────────────────

// Temporary directory used as the working directory of the agent under test.
// Skills are symlinked into <evalHome>/.claude/skills.
const evalHome = mkdtempSync(join(tmpdir(), "eval-"));
const evalSkills = join(evalHome, ".claude", "skills");

/** Remove the temporary eval home. Best effort: failures are ignored. */
function cleanup() {
  try {
    rmSync(evalHome, { recursive: true, force: true });
  } catch {
    // best effort
  }
}

// Register the handlers BEFORE anything below can call process.exit() or
// throw, so that a rejected --skill path does not leave the temp dir behind.
process.on("exit", cleanup);
process.on("SIGINT", () => { cleanup(); process.exit(130); });
process.on("SIGTERM", () => { cleanup(); process.exit(143); });

mkdirSync(evalSkills, { recursive: true });

// Skill name (directory basename) -> resolved source path, used to detect
// two --skill arguments that would map to the same symlink.
const linkedSkills = new Map();

for (const skillPath of skillPaths) {
  const resolved = resolve(skillPath);
  if (!existsSync(join(resolved, "SKILL.md"))) {
    console.error(
      `Error: ${resolved} is not a skill directory (no SKILL.md)`,
    );
    process.exit(1);
  }

  const skillName = basename(resolved);
  const previous = linkedSkills.get(skillName);
  if (previous === resolved) {
    // Same directory passed twice: already linked, nothing to do.
    continue;
  }
  if (previous !== undefined) {
    console.error(
      `Error: skills ${previous} and ${resolved} share the name ` +
      `"${skillName}"`,
    );
    process.exit(1);
  }

  symlinkSync(resolved, join(evalSkills, skillName));
  linkedSkills.set(skillName, resolved);
}

// ── Helpers ─────────────────────────────────────────────────────────────────

/**
 * Spawn a command, collect its stdout, and resolve once it has finished.
 *
 * Uses detached: true so child claude processes are not killed by process
 * group management when this script runs inside Claude Code.
 *
 * The promise settles on the child's "close" event rather than "exit":
 * "exit" can fire while the stdio pipes still hold unread data, which would
 * truncate the collected output. Both streams are decoded as UTF-8 by the
 * stream itself so a multi-byte character split across two chunks is not
 * corrupted.
 *
 * @param {string} cmd - Executable to run.
 * @param {string[]} cmdArgs - Arguments passed to the executable.
 * @param {object} [opts] - Extra spawn options; they override the defaults.
 * @returns {Promise<string>} The child's complete stdout.
 * @throws {Error} Rejects when the process cannot be spawned, is killed by
 *   a signal, or exits non-zero (the message carries up to 500 characters
 *   of stderr).
 */
function exec(cmd, cmdArgs, opts = {}) {
  // The callback is named `fulfill` so it does not shadow path.resolve.
  return new Promise((fulfill, reject) => {
    const child = spawn(cmd, cmdArgs, {
      stdio: ["ignore", "pipe", "pipe"],
      detached: true,
      ...opts,
    });

    child.stdout.setEncoding("utf8");
    child.stderr.setEncoding("utf8");

    let stdout = "";
    let stderr = "";
    child.stdout.on("data", (chunk) => { stdout += chunk; });
    child.stderr.on("data", (chunk) => { stderr += chunk; });

    child.on("error", reject);
    child.on("close", (code, signal) => {
      if (signal) {
        return reject(new Error(`${cmd} killed by ${signal}`));
      }
      if (code !== 0) {
        return reject(
          new Error(`${cmd} exited ${code}: ${stderr.slice(0, 500)}`),
        );
      }
      fulfill(stdout);
    });
  });
}

/**
 * Extract the text reply from `claude -p --output-format json` output.
 *
 * Returns the envelope's `result` field when it is a non-empty string. In
 * every other case — stdout is not JSON, the envelope has no `result`, the
 * `result` is empty, or it is not a string — the raw stdout is returned
 * unchanged, so callers can always treat the return value as a string.
 *
 * @param {string} stdout - Raw stdout of the claude CLI.
 * @returns {string} The reply text, or `stdout` itself as a fallback.
 */
function parseResult(stdout) {
  let parsed;
  try {
    parsed = JSON.parse(stdout);
  } catch {
    // Not JSON at all: hand back whatever the CLI printed.
    return stdout;
  }

  // `parsed` may legitimately be null or a primitive (e.g. stdout "null").
  const result = parsed?.result;
  return typeof result === "string" && result !== "" ? result : stdout;
}

// ── Agent under test ────────────────────────────────────────────────────────

/**
 * Deliver one user message to the agent under test and return its reply.
 *
 * The first turn opens a fresh session via --session-id; later turns pass
 * --resume with the same id to continue it. The CLI runs with evalHome as
 * its working directory, so skills come from <cwd>/.claude/skills/
 * (project-level), and --setting-sources project keeps user-level skills in
 * ~/.claude/ out of the session.
 *
 * @param {string} prompt - Message to send to the agent.
 * @param {string} sessionId - Session identifier shared by all turns.
 * @param {{ resume?: boolean }} [options] - Set `resume` for turns 2+.
 * @returns {Promise<string>} The agent's text response.
 */
async function agentTurn(prompt, sessionId, { resume = false } = {}) {
  const sessionFlag = resume ? "--resume" : "--session-id";

  const stdout = await exec(
    "claude",
    [
      "-p",
      "--output-format", "json",
      "--setting-sources", "project",
      "--permission-mode", "bypassPermissions",
      sessionFlag, sessionId,
      prompt,
    ],
    { cwd: evalHome },
  );

  return parseResult(stdout);
}

// ── Simulated user ──────────────────────────────────────────────────────────

/**
 * Ask an LLM to play the user and produce the next message of the session.
 *
 * The model is given the user briefing and the conversation so far (agent
 * messages are cut to their first 2000 characters) and is told to answer
 * with a short, natural follow-up — or with [DONE] once the goals from the
 * briefing have been fully achieved, which lets the caller end the session
 * early.
 *
 * @param {{ role: string, message: string }[]} transcript - Turns so far.
 * @param {string} briefing - Goals, knowledge and personality of the user.
 * @returns {Promise<string>} The simulated user's next message.
 */
async function userTurn(transcript, briefing) {
  const renderEntry = ({ role, message }) => {
    if (role === "user") {
      return `User: ${message}`;
    }
    return `Agent: ${message.slice(0, 2000)}`;
  };
  const conversation = transcript.map(renderEntry).join("\n\n");

  const promptLines = [
    "You are a user interacting with an AI agent.",
    "",
    "Your briefing (goals, knowledge, personality):",
    briefing,
    "",
    "Conversation so far:",
    conversation,
    "",
    "Write the user's next message (1–3 sentences). Be natural and",
    "conversational. Adapt to what the agent actually said.",
    "",
    "If your goals from the briefing have been fully achieved, respond",
    "with exactly: [DONE]",
    "",
    "Output only the user's message, nothing else.",
  ];

  const cliArgs = [
    "-p",
    "--output-format", "json",
    "--max-turns", "1",
    promptLines.join("\n"),
  ];
  return parseResult(await exec("claude", cliArgs));
}

// ── Session runner ──────────────────────────────────────────────────────────

/**
 * Drive one conversation between the simulated user and the agent.
 *
 * Turn 1 sends the test's opening prompt. Every later turn first asks the
 * simulated user for a message; a reply containing [DONE] ends the session
 * before the agent is called again. At most `test.maxTurns` turns are run.
 *
 * @param {object} test - Test definition (id, openingPrompt, userBriefing,
 *   maxTurns).
 * @param {number} runIndex - Zero-based index of this run.
 * @returns {Promise<{ run: number, sessionId: string, transcript: object[] }>}
 *   The one-based run number, the session id, and the recorded turns.
 */
async function runSession(test, runIndex) {
  const sessionId = randomUUID();
  const transcript = [];
  const briefing = test.userBriefing || "Accept whatever the agent says.";
  const runNumber = runIndex + 1;

  for (let completed = 0; completed < test.maxTurns; completed++) {
    const turn = completed + 1;
    const isOpeningTurn = turn === 1;

    let userMessage = test.openingPrompt;
    if (!isOpeningTurn) {
      console.error(
        `  [${test.id}] Run ${runNumber} / Turn ${turn}: simulated user...`,
      );
      userMessage = await userTurn(transcript, briefing);

      if (userMessage.includes("[DONE]")) {
        console.error(
          `  [${test.id}] Run ${runNumber} / Turn ${turn}: user goals met.`,
        );
        break;
      }
    }
    transcript.push({ role: "user", turn, message: userMessage });

    console.error(
      `  [${test.id}] Run ${runNumber} / Turn ${turn}: agent...`,
    );
    const reply = await agentTurn(userMessage, sessionId, {
      resume: !isOpeningTurn,
    });
    transcript.push({ role: "agent", turn, message: reply });
  }

  return { run: runNumber, sessionId, transcript };
}

// ── Grading ─────────────────────────────────────────────────────────────────

/**
 * Line the grader's raw output up with the acceptance criteria.
 *
 * Produces exactly one grade per criterion, matched by position. A grade is
 * accepted when it is "pass", "partial" or "fail" (case and surrounding
 * whitespace are ignored); a missing or unrecognised entry counts as "fail".
 * The canonical criterion text is used rather than the grader's echo of it.
 *
 * @param {string[]} criteria - Acceptance criteria, in prompt order.
 * @param {unknown[]} rawGrades - Array parsed from the grader's reply.
 * @returns {{ criterion: string, grade: string, reason: string }[]}
 */
function alignGrades(criteria, rawGrades) {
  const allowed = new Set(["pass", "partial", "fail"]);
  return criteria.map((criterion, index) => {
    const raw = rawGrades[index];
    const grade =
      typeof raw?.grade === "string" ? raw.grade.trim().toLowerCase() : "";
    if (!allowed.has(grade)) {
      return {
        criterion,
        grade: "fail",
        reason: "missing or invalid grade in grading response",
      };
    }
    return {
      criterion,
      grade,
      reason: typeof raw.reason === "string" ? raw.reason : "",
    };
  });
}

/**
 * Reduce per-criterion grades to a single run result.
 *
 * PASS    — every criterion is graded "pass" (vacuously true for no criteria)
 * FAIL    — no criterion is graded "pass" or "partial"
 * PARTIAL — anything in between
 *
 * @param {{ grade: string }[]} grades - Normalised per-criterion grades.
 * @returns {"PASS" | "PARTIAL" | "FAIL"}
 */
function summarizeGrades(grades) {
  const count = (wanted) =>
    grades.filter(({ grade }) => grade === wanted).length;
  const passCount = count("pass");
  if (passCount === grades.length) {
    return "PASS";
  }
  if (passCount === 0 && count("partial") === 0) {
    return "FAIL";
  }
  return "PARTIAL";
}

/**
 * Grade a single run's transcript against acceptance criteria.
 *
 * Calls an LLM to evaluate each criterion (agent messages are cut to their
 * first 4000 characters in the prompt). Returns per-criterion grades and an
 * overall run result (PASS / PARTIAL / FAIL, see summarizeGrades). When the
 * reply contains no parseable JSON array, a warning is logged and every
 * criterion is graded "fail" with reason "grading parse error".
 *
 * @param {{ role: string, message: string }[]} transcript - Session turns.
 * @param {string[]} criteria - Acceptance criteria.
 * @param {string[]} groundTruth - Facts the agent should identify (may be
 *   empty or omitted).
 * @returns {Promise<{ grades: object[], result: string }>}
 */
async function gradeRun(transcript, criteria, groundTruth) {
  const criteriaItems = criteria ?? [];
  const facts = groundTruth ?? [];

  const formatted = transcript
    .map((entry) =>
      entry.role === "user" ?
        `User: ${entry.message}` :
        `Agent: ${entry.message.slice(0, 4000)}`,
    )
    .join("\n\n");

  const criteriaList = criteriaItems
    .map((criterion, index) => `${index + 1}. ${criterion}`)
    .join("\n");

  const parts = [
    "You are grading an AI agent's performance in a conversation.",
    "",
    "## Acceptance criteria",
    criteriaList,
  ];

  if (facts.length > 0) {
    parts.push(
      "",
      "## Ground truth (facts the agent should identify)",
      facts.map((fact) => `- ${fact}`).join("\n"),
    );
  }

  parts.push(
    "",
    "## Transcript",
    formatted,
    "",
    "## Instructions",
    "For each acceptance criterion, evaluate whether the conversation met it.",
    "Output a JSON array with one object per criterion:",
    '  { "criterion": "...", "grade": "pass" | "fail"' +
    ' | "partial", "reason": "brief explanation" }',
    "",
    "Output only valid JSON, nothing else.",
  );

  const stdout = await exec("claude", [
    "-p",
    "--output-format", "json",
    "--max-turns", "1",
    parts.join("\n"),
  ]);
  // String() guards the .match call below against a non-string reply.
  const text = String(parseResult(stdout));

  // Shared fallback for "no array found" and "array is not valid JSON".
  const parseFailure = () => {
    console.error("  Warning: could not parse grading response as JSON.");
    return {
      grades: criteriaItems.map((criterion) => ({
        criterion, grade: "fail", reason: "grading parse error",
      })),
      result: "FAIL",
    };
  };

  // Extract JSON from the response (may be wrapped in markdown fences).
  const jsonMatch = text.match(JSON_ARRAY_RE);
  if (!jsonMatch) {
    return parseFailure();
  }

  let rawGrades;
  try {
    rawGrades = JSON.parse(jsonMatch[0]);
  } catch {
    // Brackets were found but the span between them is not valid JSON.
    return parseFailure();
  }

  const grades = alignGrades(criteriaItems, rawGrades);
  return { grades, result: summarizeGrades(grades) };
}

/**
 * Combine the results of all runs of one test into a verdict.
 *
 * PASS  — every run passed
 * FLAKY — at least two runs passed, but not all
 * FAIL  — fewer than two runs passed, or there were no runs at all
 *
 * With the default three runs this is PASS (3/3), FLAKY (2/3), FAIL (0–1/3).
 * An empty list is a FAIL: with nothing executed there is no evidence the
 * test passes, so it must not be reported as a vacuous PASS.
 *
 * @param {{ result: string }[]} runGrades - One graded result per run.
 * @returns {"PASS" | "FLAKY" | "FAIL"}
 */
function computeVerdict(runGrades) {
  if (runGrades.length === 0) return "FAIL";
  const passCount = runGrades.filter((grade) => grade.result === "PASS").length;
  if (passCount === runGrades.length) return "PASS";
  if (passCount >= 2) return "FLAKY";
  return "FAIL";
}

// ── Report generation ───────────────────────────────────────────────────────

/**
 * Collect the git details shown in the report header.
 *
 * Runs `git rev-parse --short HEAD` and `git status --porcelain` one after
 * the other. Each value is left as "unknown" when its command fails (for
 * instance outside a git repository).
 *
 * @returns {Promise<{ commitHash: string, uncommitted: string }>} The short
 *   commit hash, and either "none" or "<n> files changed".
 */
async function gitMeta() {
  const meta = { commitHash: "unknown", uncommitted: "unknown" };

  try {
    const head = await exec("git", ["rev-parse", "--short", "HEAD"]);
    meta.commitHash = head.trim();
  } catch { /* not a git repo */ }

  try {
    const porcelain = await exec("git", ["status", "--porcelain"]);
    const status = porcelain.trim();
    if (status) {
      // One porcelain line per changed path.
      meta.uncommitted = `${status.split("\n").length} files changed`;
    } else {
      meta.uncommitted = "none";
    }
  } catch { /* not a git repo */ }

  return meta;
}

/**
 * Render the markdown report for a finished eval.
 *
 * Layout: a header with commit information, one section per test (a
 * checklist of criteria followed by the per-run results), and a summary
 * table with one column per run. The number of run columns comes from the
 * module-level `runs` setting.
 *
 * Checklist marks are a majority vote across runs: "x" when more than half
 * of the runs graded the criterion "pass", "~" when passes plus partials
 * exceed half, otherwise a blank. A run without a grade for a criterion
 * counts as a fail.
 *
 * @param {string} evalName - Name shown in the title.
 * @param {object[]} testResults - Entries of { test, runGrades, verdict }.
 * @param {{ commitHash: string, uncommitted: string }} meta - Git details.
 * @param {string} timestamp - Timestamp shown in the title.
 * @returns {string} The report text, ending with a newline.
 */
function generateReport(evalName, testResults, meta, timestamp) {
  const majorityMark = (runGrades, criterionIndex) => {
    let passes = 0;
    let partials = 0;
    for (const { grades } of runGrades) {
      const value = grades[criterionIndex]?.grade;
      if (value === "pass") {
        passes++;
      } else if (value === "partial") {
        partials++;
      }
    }
    const half = runGrades.length / 2;
    if (passes > half) return "x";
    return passes + partials > half ? "~" : " ";
  };

  const header = [
    `# Eval Run — ${evalName} — ${timestamp}`,
    "",
    `**Commit:** ${meta.commitHash}`,
    `**Uncommitted changes:** ${meta.uncommitted}`,
  ];

  // One section per test: heading, criteria checklist, per-run results.
  const sections = testResults.flatMap(({ test, runGrades, verdict }) => [
    "", "---", "",
    `### ${test.id} — ${test.name}: ${verdict}`,
    "",
    ...Array.from(
      test.criteria,
      (criterion, index) =>
        `- [${majorityMark(runGrades, index)}] ${criterion}`,
    ),
    "",
    runGrades
      .map(({ result }, index) => `Run ${index + 1}: ${result}`)
      .join(" | "),
  ]);

  // Summary table.
  const runLabels = Array.from(
    { length: runs },
    (_, index) => `Run ${index + 1}`,
  );
  const rulers = runLabels.map(() => "-----");
  const rows = testResults.map(({ test, runGrades, verdict }) => {
    const runCols = runGrades.map(({ result }) => result).join(" | ");
    return `| ${test.id} | ${test.name} | ${runCols} | ${verdict} |`;
  });
  const summary = [
    "", "---", "", "## Summary", "",
    `| #   | Test | ${runLabels.join(" | ")} | Verdict |`,
    `| --- | ---- | ${rulers.join(" | ")} | ------- |`,
    ...rows,
  ];

  // Totals line: only verdicts that actually occurred are mentioned.
  const tally = (wanted) =>
    testResults.filter(({ verdict }) => verdict === wanted).length;
  const totals = [
    [tally("PASS"), "passed"],
    [tally("FLAKY"), "flaky"],
    [tally("FAIL"), "failed"],
  ]
    .filter(([count]) => count > 0)
    .map(([count, label]) => `${count} ${label}`);

  const lines = [
    ...header,
    ...sections,
    ...summary,
    "",
    `${testResults.length} tests: ${totals.join(", ")}`,
  ];
  return lines.join("\n") + "\n";
}

// ── Main ────────────────────────────────────────────────────────────────────

// Load every test from the eval file, then narrow to --test when given.
const resolvedEval = resolve(evalPath);
const allTests = await extractTests(resolvedEval);

if (allTests.length === 0) {
  console.error(`Error: no tests found in ${evalPath}`);
  process.exit(1);
}

const tests = testId ?
  allTests.filter(({ id }) => id === testId) :
  allTests;

if (tests.length === 0) {
  console.error(`Error: no test with ID "${testId}" in ${evalPath}`);
  process.exit(1);
}

// Output files are named after the eval file and live next to it.
const evalDir = dirname(resolvedEval);
const evalBasename = basename(evalPath, ".eval.md");
const timestamp = new Date().toISOString()
  .replace(TIMESTAMP_SEPARATORS_RE, "-");

// --output wins; otherwise fall back to the timestamped default.
outputPath ||= join(evalDir, `${evalBasename}.eval.${timestamp}.json`);

// Run banner (stderr), followed by a blank line.
const skillNames = skillPaths.map((path) => basename(resolve(path)));
console.error([
  `Eval: ${evalPath}`,
  `Tests: ${tests.map(({ id }) => id).join(", ")} ` +
    `(${tests.length} of ${allTests.length})`,
  `Runs: ${runs} per test`,
  `Skills: ${skillNames.join(", ") || "(none)"}`,
  `Eval home: ${evalHome}`,
  `Output: ${outputPath}\n`,
].join("\n"));

// One entry per executed test: { test, sessions, runGrades, verdict }.
const testResults = [];

for (const test of tests) {
  console.error(`\n── Test ${test.id} — ${test.name} ──`);
  console.error(`   ${test.description}`);
  if (test.maxTurns === 1) {
    console.error("   Single-turn");
  } else {
    console.error(`   Multi-turn (up to ${test.maxTurns} turns)`);
  }

  // Start all sessions at once and wait for every one of them.
  const runIndexes = Array.from({ length: runs }, (_, runIndex) => runIndex);
  const sessionResults = await Promise.all(
    runIndexes.map((runIndex) => runSession(test, runIndex)),
  );

  // Grading happens one run at a time so the log lines stay in order.
  console.error(`  Grading ${runs} runs...`);
  const runGrades = [];
  for (let runIndex = 0; runIndex < sessionResults.length; runIndex++) {
    const runNumber = runIndex + 1;
    console.error(`  Grading run ${runNumber}...`);
    const { transcript } = sessionResults[runIndex];
    const grade = await gradeRun(transcript, test.criteria, test.groundTruth);
    runGrades.push(grade);
    console.error(`  Run ${runNumber}: ${grade.result}`);
  }

  const verdict = computeVerdict(runGrades);
  console.error(`  Verdict: ${verdict}`);

  testResults.push({ test, sessions: sessionResults, runGrades, verdict });
}

// Markdown report, written next to the eval file.
const meta = await gitMeta();
const reportPath = join(evalDir, `${evalBasename}.eval.${timestamp}.md`);
const report = generateReport(evalBasename, testResults, meta, timestamp);
writeFileSync(reportPath, report);
console.error(`\nReport: ${reportPath}`);

// JSON companion file: full transcripts plus the grades of every run.
const jsonOutput = testResults.map(
  ({ test, verdict, sessions, runGrades }) => ({
    testId: test.id,
    testName: test.name,
    verdict,
    runs: sessions.map(({ run, sessionId, transcript }, index) => {
      // Grades are stored in the same order as the sessions.
      const { grades, result } = runGrades[index];
      return { run, sessionId, transcript, grades, result };
    }),
  }),
);
writeFileSync(outputPath, `${JSON.stringify(jsonOutput, null, 2)}\n`);
console.error(`JSON: ${outputPath}`);