From 72bd744664f28cc4c1b1ed856558096f6fe32e5a Mon Sep 17 00:00:00 2001 From: hansheinemann Date: Mon, 16 Mar 2026 15:51:14 -0400 Subject: [PATCH] docs: add design doc and buildspec (#5) --- docs/buildspec.md | 437 ++++++++++++++++++++++++++++++++++++++++++++++ docs/design.md | 208 ++++++++++++++++++++++ 2 files changed, 645 insertions(+) create mode 100644 docs/buildspec.md create mode 100644 docs/design.md diff --git a/docs/buildspec.md b/docs/buildspec.md new file mode 100644 index 0000000..dbabb73 --- /dev/null +++ b/docs/buildspec.md @@ -0,0 +1,437 @@ +# Tiered Agent Team System — Build Spec + +_Started: 2026-03-15. Status: Pre-build._ +_See docs/design.md for the design doc and decisions log._ + +--- + +## Language & Runtime + +**Python 3.11+.** Reasons: +- Agent/AI tooling is Python-first +- Clean type hints + dataclasses for schemas +- Agents can read and modify their own orchestration code +- Runs anywhere — no Node, no OpenClaw dependency + +--- + +## Repository + +Standalone repo: `git@github.com:coding-with-hans-heinemann/the-agency.git` + +Separate from the OpenClaw workspace. OpenClaw workspace gets a thin integration layer that calls into it. Core is portable and runnable without OpenClaw. 
+ +--- + +## Directory Structure + +``` +agent-teams/ +├── core/ +│ ├── team_runner.py — run lifecycle, agent spawning +│ ├── blackboard.py — SQLite coordination state +│ ├── task_brief.py — schema + validation +│ └── escalation.py — retry logic, failure routing +│ +├── adapters/ +│ ├── base/ +│ │ ├── llm.py — abstract LLM interface +│ │ ├── vcs.py — abstract VCS interface +│ │ ├── notify.py — abstract notification interface +│ │ └── runtime.py — abstract agent runtime interface +│ ├── llm/ +│ │ ├── anthropic.py — Claude via OpenClaw or direct API +│ │ ├── openai.py — GPT / o-series +│ │ └── ollama.py — local models +│ ├── vcs/ +│ │ └── github.py +│ ├── notify/ +│ │ └── openclaw.py — messages Hans who notifies Andrew +│ └── runtime/ +│ ├── openclaw.py — sessions_spawn (general purpose) +│ └── claude_code.py — coding agent runtime (file/git/exec tools) +│ +├── agents/ — git submodule: msitarzewski/agency-agents +│ ├── engineering/ +│ ├── testing/ +│ ├── strategy/ +│ └── ... — full agency-agents roster +│ +├── prompts/ +│ ├── t1_visionary.md — fallback if no agent_personality set +│ ├── t2_architect.md +│ ├── t3_squad_lead.md +│ ├── t4_implementer.md +│ └── t5_verifier.md +│ +├── config/ +│ ├── team.yaml — example run configuration +│ └── role_registry.yaml — maps (tier, domain) → agent personality file +│ +├── runs/ — runtime state, one subdir per run_id +│ └── .gitkeep +│ +└── README.md +``` + +--- + +## Blackboard + +SQLite. One file per run at `runs/<run_id>/blackboard.db`. 
+ +### Tables + +**runs** +```sql +CREATE TABLE runs ( + run_id TEXT PRIMARY KEY, + goal TEXT NOT NULL, + status TEXT NOT NULL, -- pending | active | review | done | failed + created_at TEXT NOT NULL, + updated_at TEXT NOT NULL +); +``` + +**workstreams** +```sql +CREATE TABLE workstreams ( + workstream_id TEXT PRIMARY KEY, + run_id TEXT NOT NULL, + name TEXT NOT NULL, + tier INTEGER NOT NULL, + status TEXT NOT NULL, -- pending | active | blocked | done | failed + owner_agent_id TEXT, + created_at TEXT NOT NULL, + updated_at TEXT NOT NULL +); +``` + +**briefs** +```sql +CREATE TABLE briefs ( + brief_id TEXT PRIMARY KEY, + run_id TEXT NOT NULL, + parent_brief_id TEXT, + workstream_id TEXT, + tier INTEGER NOT NULL, + role TEXT NOT NULL, + status TEXT NOT NULL, -- pending | active | done | failed + payload TEXT NOT NULL, -- full JSON brief + result TEXT, -- JSON result when done + retry_count INTEGER DEFAULT 0, + created_at TEXT NOT NULL, + updated_at TEXT NOT NULL +); +``` + +**events** +```sql +CREATE TABLE events ( + event_id TEXT PRIMARY KEY, + run_id TEXT NOT NULL, + brief_id TEXT, + kind TEXT NOT NULL, -- spawned | completed | failed | escalated | retried + detail TEXT, -- JSON + created_at TEXT NOT NULL +); +``` + +--- + +## Task Brief Schema + +Every brief passed between tiers is a validated JSON object. `goal_anchor` is immutable — set by T1, copied verbatim into every downstream brief. 
+ +```json +{ + "brief_id": "uuid", + "run_id": "uuid", + "parent_brief_id": "uuid | null", + "tier": 4, + "role": "implementer", + "goal_anchor": "Original T1 intent — always propagated unchanged", + "workstream": "backend-api", + "task": "Implement POST /webhooks/ingest endpoint", + "acceptance_criteria": [ + "Accepts JSON payload", + "Returns 202 on success", + "Writes to queue" + ], + "constraints": [ + "Use existing queue client in src/queue.py", + "No new dependencies" + ], + "context": { + "relevant_files": ["src/routes/webhooks.py", "src/queue.py"], + "interface_contract": "..." + }, + "retry_budget": 3, + "retry_count": 0, + "preferred_runtime": "coding_agent", + "agent_personality": "agents/engineering/engineering-backend-architect.md", + "created_at": "ISO-8601" +} +``` + +`preferred_runtime` is optional. T3 sets it to `"coding_agent"` when spawning T4/T5 for implementation or verification tasks. Runner falls back to `"standard"` if the coding agent runtime is not configured. + +`agent_personality` is optional. When set, the runtime adapter reads the file and injects its contents as the system prompt at spawn time. Falls back to the generic tier prompt in `prompts/` if not set. 
+ +``` +``` + +--- + +## Adapter Interfaces + +### LLM (`adapters/base/llm.py`) +```python +class LLMAdapter: + def complete(self, prompt: str, capability: str, context: dict) -> str + def resolve_model(self, capability: str) -> str + # capability: "reasoning-heavy" | "capable" | "fast-cheap" +``` + +### VCS (`adapters/base/vcs.py`) +```python +class VCSAdapter: + def create_branch(self, name: str) -> None + def commit(self, files: list[str], message: str) -> str # returns commit sha + def create_pr(self, title: str, body: str, head: str, base: str) -> str # returns pr url + def get_pr_status(self, pr_id: str) -> str # open | merged | closed +``` + +### Notify (`adapters/base/notify.py`) +```python +class NotifyAdapter: + def send(self, message: str, context: dict) -> None +``` + +### Runtime (`adapters/base/runtime.py`) +```python +class RuntimeAdapter: + def spawn(self, task: str, capability: str, context: dict) -> str # returns agent_id + def get_result(self, agent_id: str, timeout_s: int) -> dict + def kill(self, agent_id: str) -> None + +# Two implementations: +# openclaw.py — general purpose, uses sessions_spawn, suits T1/T2/T3 +# claude_code.py — coding-specialized, has file/git/exec tools, suits T4/T5 +# +# The runner selects runtime based on brief.preferred_runtime: +# "standard" → openclaw.py (default) +# "coding_agent" → claude_code.py (falls back to standard if unavailable) +# +# Both implementations inject brief.agent_personality as the system prompt +# when spawning, if present. Falls back to generic tier prompt otherwise. +# claude_code.py passes the agent file via --system-prompt flag natively +# (agency-agents was designed for Claude Code's agents/ directory). 
+``` + +--- + +## Run Config (`config/team.yaml`) + +```yaml +run: + goal: "Build webhook ingestion system with retry logic and DLQ" + repo: "git@github.com:org/repo.git" + base_branch: "main" + +adapters: + llm: anthropic + vcs: github + notify: openclaw + runtime: openclaw + +models: + provider: anthropic # default provider + capability_map: + reasoning-heavy: + anthropic: claude-opus-4-6 + openai: o3 + capable: + anthropic: claude-sonnet-4-6 + openai: gpt-4o + ollama: llama3.1:70b + fast-cheap: + anthropic: claude-haiku-3-5 + openai: gpt-4o-mini + ollama: llama3.2 + + # optional: override provider per tier + tier_overrides: + t1: { provider: openai, capability: reasoning-heavy } + t4: { provider: ollama, capability: fast-cheap } + +runtime: + default: openclaw + coding_agent: claude_code # used for T4/T5 when available; omit to disable + native_teams: false # Claude Code's experimental agent teams — opt-in only + # when true: T3 hands full workstream to Claude Code, + # which fans out internally. faster but less blackboard + # visibility. default: false (explicit T4 spawning) + # tier_runtime_map (optional overrides): + # t1: standard + # t2: standard + # t3: standard + # t4: coding_agent + # t5: coding_agent + +retry_defaults: + bad_output: 3 + partial: 2 + blocked: 0 # always escalate immediately +``` + +--- + +## Role Registry (`config/role_registry.yaml`) + +Maps `(tier, domain)` → agent personality file. T1 consults this during scope assessment when selecting specialists for each workstream brief. Adding a new specialist means adding one entry here — no core changes. 
+ +```yaml +t1: + default: agents/strategy/nexus-strategy.md + +t2: + backend: agents/engineering/engineering-software-architect.md + frontend: agents/engineering/engineering-software-architect.md + infra: agents/engineering/engineering-devops-automator.md + data: agents/engineering/engineering-data-engineer.md + default: agents/engineering/engineering-software-architect.md + +t3: + backend: agents/engineering/engineering-senior-developer.md + frontend: agents/engineering/engineering-senior-developer.md + infra: agents/engineering/engineering-sre.md + default: agents/engineering/engineering-senior-developer.md + +t4: + frontend: agents/engineering/engineering-frontend-developer.md + backend: agents/engineering/engineering-backend-architect.md + database: agents/engineering/engineering-database-optimizer.md + devops: agents/engineering/engineering-devops-automator.md + mobile: agents/engineering/engineering-mobile-app-builder.md + ai: agents/engineering/engineering-ai-engineer.md + security: agents/engineering/engineering-security-engineer.md + docs: agents/engineering/engineering-technical-writer.md + default: agents/engineering/engineering-senior-developer.md + +t5: + code: agents/engineering/engineering-code-reviewer.md + integration: agents/testing/testing-reality-checker.md + api: agents/testing/testing-api-tester.md + performance: agents/testing/testing-performance-benchmarker.md + security: agents/engineering/engineering-security-engineer.md + default: agents/engineering/engineering-code-reviewer.md +``` + +```yaml +``` + +--- + +## Key Flows + +### 1. Run Kickoff + +``` +User → Hans → team_runner.start(goal, config) + → generate run_id + → init blackboard (create runs/<run_id>/blackboard.db) + → build T1 brief (goal_anchor = goal, retry_budget from config) + → spawn T1 via runtime adapter + → await T1 workplan +``` + +### 2. 
T1 Scope Assessment + +``` +T1 receives brief + → assess complexity → decide depth + → identify workstreams + → set retry_budget multiplier per workstream (1x simple, 2x complex) + → emit N workstream briefs for T2 (or T3 if shallow) + → write workplan to blackboard + → team_runner spawns T2s in parallel +``` + +### 3. T4 Retry Loop (escalation.py) + +``` +spawn T4 with brief + → receive result + → classify: bad_output | blocked | partial | success + + blocked: + → log event(escalated) + → pass to T3 immediately + + bad_output, retries_remaining: + → amend brief with failure context, increment retry_count + → re-spawn T4 + → log event(retried) + + bad_output, retries_exhausted: + → log event(escalated) + → pass to T3 + + partial: + → write salvageable parts to blackboard + → re-task remainder with new brief + + success: + → write result to blackboard + → log event(completed) + → notify T3 +``` + +### 4. Review Gate + +``` +T1 completes integration + → vcs_adapter.create_pr( + title="[agent-teams] <run_id>: <goal>", + body="<workplan summary>", + head="integration/<run_id>", + base="main" + ) + → notify_adapter.send( + "Run complete. PR ready for review: <pr_url>", + context={run_id, goal, workstreams, pr_url} + ) + → blackboard: update run status → "review" + → halt — no auto-merge +``` + +--- + +## Build Order + +1. `git submodule add https://github.com/msitarzewski/agency-agents agents/` — pull the talent pool +2. `config/role_registry.yaml` — map tier+domain → agent personality files +3. `core/task_brief.py` — schema + validation (everything depends on this) +4. `core/blackboard.py` — SQLite store, all table definitions +5. `adapters/base/*` — all four abstract interfaces +6. `adapters/llm/anthropic.py` — first LLM implementation +7. `core/escalation.py` — retry + failure routing logic +8. `adapters/runtime/openclaw.py` — wire up sessions_spawn + personality injection +9. `adapters/runtime/claude_code.py` — coding agent runtime, personality via --system-prompt +10. 
`core/team_runner.py` — full run lifecycle, runtime + personality selection +11. `prompts/` — fallback tier prompts (used when no agent_personality set) +12. `adapters/vcs/github.py` — PR creation + branch management +13. `adapters/notify/openclaw.py` — Hans notification +14. `config/team.yaml` — example config +15. `README.md` — how to run, how to add adapters, how to extend the roster + +--- + +## Out of Scope (Phase 2) + +- Cost accounting per tier + run rollup +- Parallel workstream progress dashboard +- Additional adapter implementations (GitLab, Slack, OpenAI, Ollama) +- Persistent standing teams +- Web UI for run monitoring diff --git a/docs/design.md b/docs/design.md new file mode 100644 index 0000000..f51061f --- /dev/null +++ b/docs/design.md @@ -0,0 +1,208 @@ +# Tiered Agent Team System — Design Document + +_Started: 2026-03-14. Status: Pre-build, gathering requirements._ + +--- + +## Overview + +A dynamic, hierarchical multi-agent system for software pipelines. Teams assemble on demand, execute, then disband. Inspired by a blend of Hollywood production (dynamic assembly), consulting firms (structured deliverables, hierarchical synthesis), and two-pizza teams (small autonomous squads, clear domain ownership). + +--- + +## Core Principles + +**1. Tiers represent cognitive modes, not org chart levels.** +Each tier thinks differently — strategy, design, coordination, execution, verification. Adding a tier only makes sense if it introduces a genuinely different mode of reasoning. + +**2. Depth is proportional to complexity.** +Not every task needs every tier. A config change might only need T3→T4. A new product needs the full stack. + +**3. Goal anchoring at every level.** +T1's original intent is embedded in every agent's context — not just passed to T2 and forgotten. Every agent knows the end goal even if they only own a slice. + +**4. Artifacts, not summaries.** +Tiers pass structured specs downward (JSON task briefs), not paraphrased prose. 
Meaning is preserved; format is compressed. + +**5. Verification is bidirectional.** +Lower tiers verify correctness. Upper tiers verify alignment with original intent. Both directions catch different failure modes. + +**6. Provider agnostic.** +The system makes no assumptions about which LLM provider or platform is in use. Tiers reference capability levels, not specific models. All external dependencies are swappable adapters. + +**7. Specialist talent pool.** +Tiers define structure and responsibility. Agent personalities define domain expertise. The two are separate — the same tier can be filled by different specialists depending on the workstream domain. + +--- + +## Tier Definitions + +| Tier | Role | Owns | Capability Level | +|------|------|------|-----------------| +| T1 | Visionary | Goal, constraints, final acceptance, architectural bets | reasoning-heavy | +| T2 | Architect | System design, interface contracts, workstream boundaries | reasoning-heavy / capable | +| T3 | Squad Lead | Workstream delivery, worker coordination, quality gate | capable | +| T4 | Implementer | Atomic task execution (one file, one function, one test) | fast-cheap | +| T5 | Verifier | Validation of T4 output — correctness + intent alignment | capable | + +T5 runs **parallel to T4**, not above it. It's a quality gate, not a management layer. + +Capability levels map to actual models per provider in config — the core system never references a specific model name. + +--- + +## Variable Depth + +``` +Config change T3 → T4 +New feature T2 → T3 → T4 +Major refactor T1 → T2 → T3 → T4 → T5 +New system / product T1 → T2 → T3s (parallel) → T4s → T5s +``` + +T3 assesses scope on receipt. If a task is simple enough, it handles it directly without spawning upward or waiting for T2 sign-off. 
+ +--- + +## Horizontal Scaling Within Tiers + +Each tier can have multiple agents running in parallel: + +``` +T1 (1–2 agents) +├── T2: Backend Architect +│ ├── T3: API Squad Lead +│ │ ├── T4: Worker — endpoint A +│ │ ├── T4: Worker — endpoint B +│ │ └── T5: Verifier +│ └── T3: DB Squad Lead +│ ├── T4: Worker — migrations +│ └── T5: Verifier +├── T2: Frontend Architect +│ └── T3: UI Squad Lead +│ ├── T4: Worker — component X +│ └── T4: Worker — component Y +└── T2: Infra Architect + └── T3: Platform Squad Lead + └── T4: Worker — config / deploy +``` + +--- + +## Shared State + +For software pipelines, **the repo is the primary blackboard**: +- T4 workers commit to feature branches +- T3 leads review and merge to workstream branches +- T2 architects own integration branches +- T1 does final integration and acceptance + +Supplemented by a SQLite coordination store per run tracking in-flight workstreams, handoff artifacts, tier status, and retry counts. + +--- + +## Failure Handling + +| Failure | Handler | Action | +|---------|---------|--------| +| T4 bad output | T3 | Retry T4 with corrected brief (up to retry_budget) | +| T4 blocked | T3 | Escalate immediately — no retries | +| T4 partial output | T3 | Salvage good parts, re-task remainder | +| T3 workstream stuck | T2 | Re-scope or split the workstream | +| T2 design wrong | T1 | Re-plan; may discard workstream and restart | +| Repeated escalation | Surface to user | Block until human unblocks | + +Retry limits prevent infinite loops. Escalation path is always upward, never sideways. + +--- + +## Agent Talent Pool + +The system builds on [agency-agents](https://github.com/msitarzewski/agency-agents) — a library of 50+ pre-built specialist personalities, each with deep domain expertise, quality standards, and specific deliverables. 
+ +**Division of responsibility:** +- Our system provides: orchestration, tier structure, task briefs, retries, verification gates, shared state +- Agency-agents provides: the specialist knowledge each agent brings to its role + +T1 selects the right specialist from the roster when building workstream briefs. The specialist's personality is injected as the system prompt at spawn time. + +**Default tier-to-specialist mapping for software pipelines:** + +| Tier | Domain | Agent | +|------|--------|-------| +| T1 | Strategy | nexus-strategy | +| T2 | Backend | software-architect | +| T2 | Infra | devops-automator | +| T2 | Data | data-engineer | +| T3 | Backend | senior-developer | +| T3 | Reliability | sre | +| T4 | Frontend | frontend-developer | +| T4 | Backend | backend-architect | +| T4 | Database | database-optimizer | +| T4 | DevOps | devops-automator | +| T4 | Mobile | mobile-app-builder | +| T4 | AI/ML | ai-engineer | +| T4 | Security | security-engineer | +| T4 | Docs | technical-writer | +| T5 | Code review | code-reviewer | +| T5 | Integration | testing-reality-checker | +| T5 | API | testing-api-tester | +| T5 | Performance | testing-performance-benchmarker | +| T5 | Security | security-engineer | + +The roster is not fixed — T1 can select any agent from the library based on workstream needs. Non-engineering agents (design, marketing, product) extend the system to non-software pipelines. + +--- + +## Adapter Layers + +Everything external is a swappable adapter. Core logic never imports from adapters directly — always through an interface. + +``` +Core (platform-agnostic) +├── team_runner — run lifecycle, agent spawning, runtime selection +├── blackboard — SQLite coordination state +├── task_brief — schema + validation +└── escalation — retry logic, failure routing + +Adapters (swappable) +├── llm/ — anthropic (now), openai, ollama, any API +├── notify/ — openclaw (now), slack, email, webhook... +├── vcs/ — github (now), gitlab, gitea, bare git... 
+└── runtime/ + ├── standard — openclaw sessions_spawn (T1/T2/T3) + └── coding_agent — claude_code (T4/T5 default), codex, aider... +``` + +Swapping providers means writing a new adapter file — nothing in core changes. + +T4 and T5 default to the **coding agent runtime** when available. It provides direct file system access, git operations, and test execution — no need to shuttle file contents through message context. Falls back to standard runtime gracefully if not configured. + +--- + +## Decisions + +**Depth decision** — T1 assesses scope on receipt and determines how many tiers to engage. Not pre-configured per task type. + +**Trigger mechanism** — User messages Hans → Hans spins up T1 with the goal. T1 takes it from there. + +**Output / review** — Nothing merges to main without Andrew's explicit approval. T1 opens a PR and surfaces it to Andrew for review. Merge is gated on human sign-off. Notification is dual: Hans messages Andrew directly, and a PR is opened on the VCS platform so Andrew gets notified natively too. This keeps the review step platform-independent — whichever VCS is in use, Hans always notifies Andrew directly as a fallback. + +**Retry limits** — Three failure types, handled differently: +- *Bad output* → retry T4 with a corrected brief (default: 3 retries) +- *Blocked* → escalate immediately, no retries +- *Partial output* → salvage good parts, re-task the remainder + +T1 sets a retry budget multiplier during scope assessment (`1x` simple, `2x` complex). Retry budget is a field on the task brief — not hardcoded in the runner. + +**Platform agnosticism** — Core logic is provider and platform agnostic. LLMs, VCS, notifications, and agent runtimes are all adapters. Tiers reference capability levels (`reasoning-heavy`, `capable`, `fast-cheap`), not specific model names. Provider-to-model mapping lives in config. + +**LLM provider** — Anthropic first implementation. 
Config supports per-tier provider selection and mixing providers across tiers (e.g. T1 on OpenAI o3, T4 workers on local Ollama). + +**Gateway modification** — Decided against. Agent-teams stays standalone Python. OpenClaw is used as the runtime adapter via existing primitives (sessions_spawn, sessions_send, subagents) — called through a skill layer. No gateway fork. Keeps platform agnosticism intact and avoids Node/Python mismatch and fork maintenance burden. + +**Coding agent runtime** — Claude Code is the default T4/T5 runtime for software pipelines. It is purpose-built for implementation and verification: direct file access, git ops, test execution. Enters as a runtime adapter — swappable for Codex, Aider, or any equivalent. T1/T2/T3 always use the standard runtime (they reason, they don't edit files). + +**Claude Code native teams** — Claude Code has an experimental agent teams feature that fans out sub-agents internally within a session. Integrated as an opt-in flag (`native_teams: true`) in the coding_agent runtime adapter. When enabled, T3 hands a full workstream to Claude Code and it parallelises internally — faster, but less granular blackboard visibility. Default is `false` — explicit T4 spawning is the baseline; native teams is a speed optimisation to enable deliberately. + +**Agency-agents integration** — Agent personalities sourced from [msitarzewski/agency-agents](https://github.com/msitarzewski/agency-agents) via git submodule. Included as `agents/` in the repo. T1 selects specialists from the roster via `config/role_registry.yaml`. Each task brief carries an `agent_personality` field (path to the agent .md file) which the runtime adapter injects as the system prompt at spawn time. Adding new specialists means adding an entry to the registry — no core changes required.