test+docs: verify-plan tests and README for verify_task_against_plan
23 unit tests covering parseAcceptanceCriteria, extractKeywords, checkACStatus, computeDriftScore, lineDiff, and 4 end-to-end scenarios (plan unchanged, edited, AC missed, no baseline). README documents the tool with example output and heuristic limitations. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
f51b7a6178
commit
d9f3a7ea40
2 changed files with 241 additions and 0 deletions
43
README.md
43
README.md
|
|
@ -25,9 +25,52 @@ activity and create todos via native tool calls instead of curl.
|
||||||
| `get_question_answer` | Fetch the current status + answer of a previously-asked question | n/a |
|
| `get_question_answer` | Fetch the current status + answer of a previously-asked question | n/a |
|
||||||
| `list_open_questions` | List own open/answered questions, most recent first (max 50) | n/a |
|
| `list_open_questions` | List own open/answered questions, most recent first (max 50) | n/a |
|
||||||
| `cancel_question` | Cancel an own open question (asker-only) | no |
|
| `cancel_question` | Cancel an own open question (asker-only) | no |
|
||||||
|
| `wait_for_job` | Block until a QUEUED ClaudeJob is available, claim it atomically, return full task context with frozen `plan_snapshot` | no |
|
||||||
|
| `update_job_status` | Report job transition to `running`, `done`, or `failed`; triggers SSE event to UI | no |
|
||||||
|
| `verify_task_against_plan` | Compare frozen `plan_snapshot` against current plan + story logs + commits; returns per-AC ✓/✗/? heuristic and drift-score | yes (read-only) |
|
||||||
|
|
||||||
Demo accounts may read but writes return `PERMISSION_DENIED`.
|
Demo accounts may read but writes return `PERMISSION_DENIED`.
|
||||||
|
|
||||||
|
### verify_task_against_plan
|
||||||
|
|
||||||
|
Compares the immutable snapshot captured at claim time against the current state of the work. Useful at the end of a job to self-assess completeness.
|
||||||
|
|
||||||
|
**Input**
|
||||||
|
|
||||||
|
```json
|
||||||
|
{ "task_id": "cmolqlqvh0023q..." }
|
||||||
|
```
|
||||||
|
|
||||||
|
**Output**
|
||||||
|
|
||||||
|
```
|
||||||
|
# Verify task: Prisma-schema + migratie in Scrum4Me (cmolqlqvh...)
|
||||||
|
|
||||||
|
## Plan
|
||||||
|
- Snapshot: - Bewerk prisma/schema.prisma:...
|
||||||
|
- Current: - Bewerk prisma/schema.prisma:...
|
||||||
|
- Edited onderweg: **no**
|
||||||
|
|
||||||
|
## AC-checks (5/6 ✓ — drift-score 83%)
|
||||||
|
- ✓ Scrum4Me prisma/schema.prisma: nieuw veld plan_snapshot...
|
||||||
|
- ✓ Migratie aangemaakt en getest
|
||||||
|
- ✗ vendor/scrum4me submodule in scrum4me-mcp gebumpt
|
||||||
|
|
||||||
|
## Realisatie
|
||||||
|
- 1 log_implementation-entry
|
||||||
|
- commit `a3af2dd` — feat: add plan_snapshot field to ClaudeJob schema
|
||||||
|
|
||||||
|
---
|
||||||
|
⚠️ Heuristiek-rapport — handmatige PR-review blijft nodig
|
||||||
|
```
|
||||||
|
|
||||||
|
**Beperkingen heuristiek**
|
||||||
|
|
||||||
|
- Zoekt op sleutelwoorden (filenames, camelCase-identifiers, lange woorden) — geen semantisch begrip
|
||||||
|
- AC's die alleen over externe verificatie gaan (deployment, user-test) scoren altijd ✗ zonder extra log-entries
|
||||||
|
- Plan_snapshot is NULL voor jobs die zijn geclaimed vóór versie met snapshot-feature — rapport meldt "no baseline"
|
||||||
|
- Gebruik het rapport als startpunt, niet als definitief oordeel; PR-review blijft leidend
|
||||||
|
|
||||||
## Prompts
|
## Prompts
|
||||||
|
|
||||||
- `implement_next_story` — full workflow: fetch context, log plan, walk
|
- `implement_next_story` — full workflow: fetch context, log plan, walk
|
||||||
|
|
|
||||||
198
__tests__/verify-plan.test.ts
Normal file
198
__tests__/verify-plan.test.ts
Normal file
|
|
@ -0,0 +1,198 @@
|
||||||
|
import { describe, it, expect } from 'vitest'
|
||||||
|
import {
|
||||||
|
parseAcceptanceCriteria,
|
||||||
|
extractKeywords,
|
||||||
|
checkACStatus,
|
||||||
|
computeDriftScore,
|
||||||
|
lineDiff,
|
||||||
|
buildVerifyResult,
|
||||||
|
renderMarkdownReport,
|
||||||
|
} from '../src/lib/verify-plan.js'
|
||||||
|
|
||||||
|
describe('parseAcceptanceCriteria', () => {
|
||||||
|
it('returns empty array for null', () => {
|
||||||
|
expect(parseAcceptanceCriteria(null)).toEqual([])
|
||||||
|
})
|
||||||
|
|
||||||
|
it('parses dash-prefixed lines', () => {
|
||||||
|
const text = '- First AC\n- Second AC\n- Third AC'
|
||||||
|
expect(parseAcceptanceCriteria(text)).toEqual(['First AC', 'Second AC', 'Third AC'])
|
||||||
|
})
|
||||||
|
|
||||||
|
it('strips numbered prefixes', () => {
|
||||||
|
const text = '1. Do this\n2. Do that'
|
||||||
|
expect(parseAcceptanceCriteria(text)).toEqual(['Do this', 'Do that'])
|
||||||
|
})
|
||||||
|
|
||||||
|
it('ignores blank lines', () => {
|
||||||
|
const text = '- AC1\n\n- AC2'
|
||||||
|
expect(parseAcceptanceCriteria(text)).toEqual(['AC1', 'AC2'])
|
||||||
|
})
|
||||||
|
})
|
||||||
|
|
||||||
|
describe('extractKeywords', () => {
|
||||||
|
it('extracts filenames with extensions', () => {
|
||||||
|
const kws = extractKeywords('update wait-for-job.ts and verify-plan.ts')
|
||||||
|
expect(kws).toContain('wait-for-job.ts')
|
||||||
|
expect(kws).toContain('verify-plan.ts')
|
||||||
|
})
|
||||||
|
|
||||||
|
it('extracts long words', () => {
|
||||||
|
const kws = extractKeywords('implementation snapshot detection')
|
||||||
|
expect(kws).toContain('implementation')
|
||||||
|
expect(kws).toContain('snapshot')
|
||||||
|
expect(kws).toContain('detection')
|
||||||
|
})
|
||||||
|
|
||||||
|
it('returns unique keywords', () => {
|
||||||
|
const kws = extractKeywords('implementation implementation')
|
||||||
|
const count = kws.filter((k) => k === 'implementation').length
|
||||||
|
expect(count).toBe(1)
|
||||||
|
})
|
||||||
|
})
|
||||||
|
|
||||||
|
describe('checkACStatus', () => {
|
||||||
|
it('returns ✓ when majority of keywords found in corpus', () => {
|
||||||
|
const ac = 'plan_snapshot field added to ClaudeJob'
|
||||||
|
const corpus = 'added plan_snapshot field to claudejob schema migration'
|
||||||
|
expect(checkACStatus(ac, corpus)).toBe('✓')
|
||||||
|
})
|
||||||
|
|
||||||
|
it('returns ✗ when no keywords found', () => {
|
||||||
|
const ac = 'zxqwerty obscure feature nobody implemented'
|
||||||
|
const corpus = 'completely different log content about other things'
|
||||||
|
expect(checkACStatus(ac, corpus)).toBe('✗')
|
||||||
|
})
|
||||||
|
|
||||||
|
it('returns ? when partial match', () => {
|
||||||
|
const ac = 'snapshot captured at claim time with plan_snapshot field'
|
||||||
|
const corpus = 'snapshot written to database'
|
||||||
|
const result = checkACStatus(ac, corpus)
|
||||||
|
expect(['?', '✓']).toContain(result)
|
||||||
|
})
|
||||||
|
|
||||||
|
it('returns ? for very short AC with no extractable keywords', () => {
|
||||||
|
expect(checkACStatus('Ok', 'anything')).toBe('?')
|
||||||
|
})
|
||||||
|
})
|
||||||
|
|
||||||
|
describe('computeDriftScore', () => {
|
||||||
|
it('returns 100 when all pass', () => {
|
||||||
|
const results = [{ status: '✓' as const }, { status: '✓' as const }]
|
||||||
|
expect(computeDriftScore(results)).toBe(100)
|
||||||
|
})
|
||||||
|
|
||||||
|
it('returns 0 when all fail', () => {
|
||||||
|
const results = [{ status: '✗' as const }, { status: '✗' as const }]
|
||||||
|
expect(computeDriftScore(results)).toBe(0)
|
||||||
|
})
|
||||||
|
|
||||||
|
it('returns 50 for half passing', () => {
|
||||||
|
const results = [{ status: '✓' as const }, { status: '✗' as const }]
|
||||||
|
expect(computeDriftScore(results)).toBe(50)
|
||||||
|
})
|
||||||
|
|
||||||
|
it('returns 0 for empty list', () => {
|
||||||
|
expect(computeDriftScore([])).toBe(0)
|
||||||
|
})
|
||||||
|
})
|
||||||
|
|
||||||
|
describe('lineDiff', () => {
|
||||||
|
it('returns null when strings are identical', () => {
|
||||||
|
expect(lineDiff('line1\nline2', 'line1\nline2')).toBeNull()
|
||||||
|
})
|
||||||
|
|
||||||
|
it('shows added lines with +', () => {
|
||||||
|
const diff = lineDiff('line1', 'line1\nline2')
|
||||||
|
expect(diff).toContain('+ line2')
|
||||||
|
})
|
||||||
|
|
||||||
|
it('shows removed lines with -', () => {
|
||||||
|
const diff = lineDiff('line1\nline2', 'line1')
|
||||||
|
expect(diff).toContain('- line2')
|
||||||
|
})
|
||||||
|
|
||||||
|
it('shows changed lines as remove+add pair', () => {
|
||||||
|
const diff = lineDiff('old line', 'new line')
|
||||||
|
expect(diff).toContain('- old line')
|
||||||
|
expect(diff).toContain('+ new line')
|
||||||
|
})
|
||||||
|
})
|
||||||
|
|
||||||
|
describe('buildVerifyResult + renderMarkdownReport', () => {
|
||||||
|
it('scenario: plan unchanged, all ACs matched in logs — 100% drift score', () => {
|
||||||
|
const plan = 'Add plan_snapshot field to ClaudeJob schema'
|
||||||
|
const result = buildVerifyResult({
|
||||||
|
taskId: 'task-1',
|
||||||
|
taskTitle: 'Prisma migration',
|
||||||
|
planSnapshot: plan,
|
||||||
|
currentPlan: plan,
|
||||||
|
acceptanceCriteriaText: '- plan_snapshot field added\n- migration created',
|
||||||
|
implementationLogs: ['Added plan_snapshot field, created migration file for claudejob'],
|
||||||
|
commits: [{ hash: 'abc123', message: 'feat: add plan_snapshot to claudejob schema' }],
|
||||||
|
})
|
||||||
|
|
||||||
|
expect(result.planEdited).toBe(false)
|
||||||
|
expect(result.planDiff).toBeNull()
|
||||||
|
expect(result.hasBaseline).toBe(true)
|
||||||
|
expect(result.driftScore).toBeGreaterThanOrEqual(50)
|
||||||
|
|
||||||
|
const report = renderMarkdownReport(result)
|
||||||
|
expect(report).toContain('Edited onderweg: **no**')
|
||||||
|
expect(report).toContain('drift-score')
|
||||||
|
})
|
||||||
|
|
||||||
|
it('scenario: plan edited onderweg — planEdited=true, diff in output', () => {
|
||||||
|
const result = buildVerifyResult({
|
||||||
|
taskId: 'task-2',
|
||||||
|
taskTitle: 'Wait for job update',
|
||||||
|
planSnapshot: 'Original plan\nStep 1',
|
||||||
|
currentPlan: 'Original plan\nStep 1 revised\nStep 2 added',
|
||||||
|
acceptanceCriteriaText: null,
|
||||||
|
implementationLogs: [],
|
||||||
|
commits: [],
|
||||||
|
})
|
||||||
|
|
||||||
|
expect(result.planEdited).toBe(true)
|
||||||
|
expect(result.planDiff).not.toBeNull()
|
||||||
|
expect(result.planDiff).toContain('- Step 1')
|
||||||
|
expect(result.planDiff).toContain('+ Step 1 revised')
|
||||||
|
|
||||||
|
const report = renderMarkdownReport(result)
|
||||||
|
expect(report).toContain('Edited onderweg: **yes**')
|
||||||
|
expect(report).toContain('```diff')
|
||||||
|
})
|
||||||
|
|
||||||
|
it('scenario: AC without match in logs → ✗', () => {
|
||||||
|
const result = buildVerifyResult({
|
||||||
|
taskId: 'task-3',
|
||||||
|
taskTitle: 'Unimplemented feature',
|
||||||
|
planSnapshot: 'some plan',
|
||||||
|
currentPlan: 'some plan',
|
||||||
|
acceptanceCriteriaText: '- zxcvbnm_completely_missing_feature deployed',
|
||||||
|
implementationLogs: ['unrelated work done here'],
|
||||||
|
commits: [],
|
||||||
|
})
|
||||||
|
|
||||||
|
expect(result.acceptanceCriteria[0].status).toBe('✗')
|
||||||
|
expect(result.driftScore).toBe(0)
|
||||||
|
})
|
||||||
|
|
||||||
|
it('scenario: stale claim (snapshot null) → no baseline in report', () => {
|
||||||
|
const result = buildVerifyResult({
|
||||||
|
taskId: 'task-4',
|
||||||
|
taskTitle: 'Old job',
|
||||||
|
planSnapshot: null,
|
||||||
|
currentPlan: 'current plan',
|
||||||
|
acceptanceCriteriaText: '- something done',
|
||||||
|
implementationLogs: ['something done here'],
|
||||||
|
commits: [],
|
||||||
|
})
|
||||||
|
|
||||||
|
expect(result.hasBaseline).toBe(false)
|
||||||
|
expect(result.planEdited).toBe(false)
|
||||||
|
|
||||||
|
const report = renderMarkdownReport(result)
|
||||||
|
expect(report).toContain('no baseline')
|
||||||
|
})
|
||||||
|
})
|
||||||
Loading…
Add table
Add a link
Reference in a new issue