feat: stale watchdog, review orchestration, loop-break prompt
All checks were successful
ci/woodpecker/push/woodpecker Pipeline was successful
All checks were successful
ci/woodpecker/push/woodpecker Pipeline was successful
This commit is contained in:
parent
7b5eb15292
commit
fbc5e33292
2 changed files with 298 additions and 6 deletions
|
|
@ -164,6 +164,12 @@ Instructions:
|
|||
5. Leave a comment on the Vikunja task summarising what you did and linking any PR.
|
||||
6. Do not mark the task as done — the entrypoint will move it to In Review when you finish.
|
||||
7. Do not ask for confirmation — act autonomously within your role constraints.
|
||||
|
||||
IMPORTANT — avoid looping:
|
||||
- If you attempt the same fix more than twice and it is still not working, STOP.
|
||||
- Do not keep retrying the same approach hoping for a different result.
|
||||
- Instead: leave a Vikunja comment explaining exactly what you tried and what is blocking you, then exit.
|
||||
- A human will read it and unblock you. Spinning wastes time and money.
|
||||
"""
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -3,9 +3,13 @@
|
|||
AutoJanet Dispatcher
|
||||
|
||||
Runs as a CronJob every 2 minutes. Discovers the target Vikunja project by
|
||||
name, resolves the kanban view and all 5 standard bucket IDs by name, then
|
||||
claims tasks from the Todo bucket that have an `agent:<role>` label and
|
||||
spawns a Kubernetes Job for the appropriate agent.
|
||||
name, resolves the kanban view and all 5 standard bucket IDs by name, then:
|
||||
|
||||
1. Sweeps done=True tasks into the Done bucket.
|
||||
2. Watchdog: moves stale InProgress tasks (no live job) back to Todo, or to Backlog once MAX_TASK_RETRIES is reached.
|
||||
3. Review orchestration: spawns a PM agent for each task in Review that
|
||||
needs sub-tasks created (code-review, security-review, etc.).
|
||||
4. Claims tasks from the Todo bucket and spawns agent Jobs.
|
||||
|
||||
Config (env vars):
|
||||
OPENBAO_ADDR, OPENBAO_ROLE_ID, OPENBAO_SECRET_ID
|
||||
|
|
@ -45,6 +49,7 @@ AGENT_IMAGE = os.environ.get("AGENT_IMAGE", "registry.ctz.fyi/library/autojane
|
|||
# Max concurrent jobs per role (and global across all roles)
|
||||
MAX_JOBS_PER_ROLE = int(os.environ.get("MAX_JOBS_PER_ROLE", "2"))
|
||||
MAX_JOBS_TOTAL = int(os.environ.get("MAX_JOBS_TOTAL", "10"))
|
||||
MAX_TASK_RETRIES = int(os.environ.get("MAX_TASK_RETRIES", "3"))
|
||||
|
||||
# Standard bucket names (case-insensitive match)
|
||||
BUCKET_BACKLOG = "backlog"
|
||||
|
|
@ -227,6 +232,276 @@ def sweep_done_tasks(
|
|||
log.info("Done sweep: moved %d tasks", moved)
|
||||
|
||||
|
||||
# ── Role extraction ───────────────────────────────────────────────────────────
|
||||
|
||||
def extract_agent_role(task: dict) -> str | None:
    """Return the role named by the task's first valid ``agent:<role>`` label.

    Labels are scanned in order; a label only counts when the text after the
    ``agent:`` prefix is a member of ``VALID_ROLES``. Returns ``None`` when the
    task has no labels or no label names a valid role.
    """
    prefix = "agent:"
    # Lazy pipeline: label titles -> text after the prefix -> first valid role.
    titles = (label.get("title", "") for label in task.get("labels") or [])
    candidates = (title[len(prefix):] for title in titles if title.startswith(prefix))
    return next((role for role in candidates if role in VALID_ROLES), None)
|
||||
|
||||
|
||||
# ── Stale watchdog ────────────────────────────────────────────────────────────
|
||||
|
||||
def count_jobs_for_task(batch_v1: k8s_client.BatchV1Api, task_id: int) -> tuple[int, int]:
    """
    Return (active_count, total_ever) for all jobs with this task_id label.

    active     = jobs with no terminal outcome yet (still running or pending).
    total_ever = every job currently listed for this task, regardless of
                 outcome. NOTE(review): jobs already removed from the cluster
                 (e.g. via ttl_seconds_after_finished) are not returned by the
                 list call and therefore are not counted.

    Used to detect both dead-job stale and successful-but-looping scenarios.

    Raises whatever ``list_namespaced_job`` raises; no error handling is done
    here.
    """
    jobs = batch_v1.list_namespaced_job(
        namespace=K8S_NAMESPACE,
        label_selector=f"autojanet/task-id={task_id}",
        _request_timeout=15,
    )

    def is_terminal(job) -> bool:
        """Return True once the job has definitively succeeded or failed."""
        status = job.status
        if status is None:
            # Freshly created job with no status yet: still pending.
            return False

        # The Job controller records the final outcome as a Complete/Failed
        # condition; trust it when present (also covers deadline failures).
        for cond in status.conditions or []:
            if cond.type in ("Complete", "Failed") and cond.status == "True":
                return True

        if (status.succeeded or 0) > 0:
            return True

        # Fallback on pod failure counts. Compare against None explicitly:
        # `backoff_limit or 1` would turn a legitimate limit of 0 into 1 and
        # leave a job that failed its only attempt looking active forever.
        spec = job.spec
        backoff_limit = spec.backoff_limit if spec is not None else None
        if backoff_limit is None:
            backoff_limit = 6  # Kubernetes default for an unset backoffLimit
        return (status.failed or 0) >= backoff_limit + 1

    items = jobs.items or []
    active = sum(1 for job in items if not is_terminal(job))
    return active, len(items)
|
||||
|
||||
|
||||
def post_stale_comment(vikunja_token: str, task_id: int, reason: str) -> None:
    """Post ``reason`` as a comment on the given Vikunja task (best effort).

    Any failure (transport error or non-2xx response) is logged as a warning
    and swallowed, so the caller is never interrupted by a failed comment.
    """
    try:
        endpoint = f"{VIKUNJA_BASE_URL}/api/v1/tasks/{task_id}/comments"
        auth_headers = {
            "Authorization": f"Bearer {vikunja_token}",
            "Content-Type": "application/json",
        }
        response = httpx.put(
            endpoint,
            headers=auth_headers,
            json={"comment": reason},
            timeout=10,
        )
        response.raise_for_status()
    except Exception as e:
        log.warning("Failed to post stale comment on task %d: %s", task_id, e)
|
||||
|
||||
|
||||
def watchdog_stale_tasks(
    vikunja_token: str,
    batch_v1: k8s_client.BatchV1Api,
    project_id: int,
    view_id: int,
    in_progress_id: int,
    todo_id: int,
    backlog_id: int,
) -> None:
    """
    For every not-done task in the InProgress bucket, check whether it still
    has an active Job (see ``count_jobs_for_task``).

    - At least one active job: the task is left alone.
    - No active job (none found, or all finished — failed *or* succeeded):
        - job count <  MAX_TASK_RETRIES → move back to Todo + comment
        - job count >= MAX_TASK_RETRIES → move to Backlog + comment

    The explanatory comment is only posted when the bucket move succeeded, so
    a task that could not be moved is not annotated with a false "moved"
    message on every dispatcher run.
    """
    page_size = 50
    page = 1
    tasks = []
    while True:
        batch = vikunja_get(vikunja_token, f"projects/{project_id}/tasks", page=page, per_page=page_size)
        if not batch:
            break
        tasks.extend(batch)
        if len(batch) < page_size:
            # Short page: nothing further to fetch.
            break
        page += 1

    stale = [t for t in tasks if not t.get("done") and t.get("bucket_id") == in_progress_id]
    log.info("Watchdog: checking %d InProgress tasks", len(stale))

    for task in stale:
        task_id = task["id"]
        active, total_ever = count_jobs_for_task(batch_v1, task_id)

        if active > 0:
            log.debug("Task %d has %d active job(s), leaving alone", task_id, active)
            continue

        # No active job — task is stuck or looping
        log.warning("Task %d is stale: active=%d total_attempts=%d", task_id, active, total_ever)

        if total_ever >= MAX_TASK_RETRIES:
            log.error("Task %d hit retry limit (%d/%d), moving to Backlog", task_id, total_ever, MAX_TASK_RETRIES)
            target_bucket_id = backlog_id
            failure_msg = "Failed to move task %d to Backlog: %s"
            comment = (
                f"🚨 **Watchdog**: task has been attempted {total_ever} time(s) and hit the retry limit "
                f"(`MAX_TASK_RETRIES={MAX_TASK_RETRIES}`). Moved to **Backlog** for manual review."
            )
        else:
            log.info("Task %d attempt %d/%d, requeueing to Todo", task_id, total_ever + 1, MAX_TASK_RETRIES)
            target_bucket_id = todo_id
            failure_msg = "Failed to requeue task %d to Todo: %s"
            comment = (
                f"⚠️ **Watchdog**: no active job found for this task (attempt {total_ever + 1}/{MAX_TASK_RETRIES}). "
                f"Requeued to **Todo** for retry."
            )

        try:
            vikunja_post(
                vikunja_token,
                f"projects/{project_id}/views/{view_id}/buckets/{target_bucket_id}/tasks",
                {"task_id": task_id},
            )
        except Exception as e:
            # The task stays in InProgress and will be re-examined next run;
            # do not claim in a comment that it was moved.
            log.warning(failure_msg, task_id, e)
            continue

        post_stale_comment(vikunja_token, task_id, comment)
|
||||
|
||||
|
||||
# ── Review orchestration ──────────────────────────────────────────────────────
|
||||
|
||||
# Roles that should NOT trigger review orchestration (would cause loops).
# orchestrate_review_tasks skips any Review-bucket task whose agent:<role>
# label names one of these roles.
REVIEW_SKIP_ROLES = {"pm", "code-reviewer"}
|
||||
|
||||
|
||||
def spawn_review_pm_job(
    batch_v1: k8s_client.BatchV1Api,
    task_id: int,
    task_title: str,
    in_review_bucket_id: int,
    project_id: int,
    view_id: int,
) -> None:
    """Spawn a PM agent job to orchestrate review sub-tasks for a completed task.

    The job is named ``review-pm-<task_id>``; if ``job_already_exists`` reports
    a job with that name, nothing is created. Otherwise a single-container Job
    running ``AGENT_IMAGE`` with ``AGENT_ROLE=pm`` and
    ``TASK_TYPE=review_orchestration`` is submitted to ``K8S_NAMESPACE``.
    Errors from the create call propagate to the caller.
    """
    name = f"review-pm-{task_id}"
    if job_already_exists(batch_v1, name):
        log.debug("Review PM job %s already exists, skipping", name)
        return

    def job_labels() -> dict:
        # Fresh dict per call so the Job and its pod template never share one.
        return {
            "autojanet/type": "review-pm",
            "autojanet/role": "pm",
            "autojanet/task-id": str(task_id),
        }

    def approle_env(env_name: str, secret_key: str):
        # AppRole credentials come from the agent-pm-approle Secret.
        return k8s_client.V1EnvVar(
            name=env_name,
            value_from=k8s_client.V1EnvVarSource(
                secret_key_ref=k8s_client.V1SecretKeySelector(
                    name="agent-pm-approle", key=secret_key,
                )
            ),
        )

    literal_env = [
        ("AGENT_ROLE", "pm"),
        ("TASK_TYPE", "review_orchestration"),
        ("TASK_ID", str(task_id)),
        ("TASK_TITLE", task_title),
        ("OPENBAO_ADDR", OPENBAO_ADDR),
        ("VIKUNJA_BASE_URL", VIKUNJA_BASE_URL),
        ("LITELLM_BASE_URL", "https://llm.ctz.fyi"),
        ("FORGEJO_BASE_URL", "https://git.ctz.fyi"),
        ("IN_REVIEW_BUCKET_ID", str(in_review_bucket_id)),
        ("VIKUNJA_PROJECT_ID", str(project_id)),
        ("VIKUNJA_VIEW_ID", str(view_id)),
    ]
    env = [k8s_client.V1EnvVar(name=key, value=value) for key, value in literal_env]
    env.append(approle_env("OPENBAO_ROLE_ID", "role_id"))
    env.append(approle_env("OPENBAO_SECRET_ID", "secret_id"))

    container = k8s_client.V1Container(
        name="agent",
        image=AGENT_IMAGE,
        image_pull_policy="Always",
        env=env,
        resources=k8s_client.V1ResourceRequirements(
            requests={"cpu": "250m", "memory": "512Mi"},
            limits={"cpu": "2000m", "memory": "2Gi"},
        ),
        security_context=k8s_client.V1SecurityContext(
            allow_privilege_escalation=False,
            run_as_non_root=True,
            run_as_user=1000,
            capabilities=k8s_client.V1Capabilities(drop=["ALL"]),
        ),
    )

    pod_template = k8s_client.V1PodTemplateSpec(
        metadata=k8s_client.V1ObjectMeta(labels=job_labels()),
        spec=k8s_client.V1PodSpec(
            service_account_name="agent-pm",
            restart_policy="Never",
            node_selector={"kubernetes.io/arch": "amd64"},
            containers=[container],
        ),
    )

    job = k8s_client.V1Job(
        api_version="batch/v1",
        kind="Job",
        metadata=k8s_client.V1ObjectMeta(
            name=name,
            namespace=K8S_NAMESPACE,
            labels=job_labels(),
        ),
        spec=k8s_client.V1JobSpec(
            ttl_seconds_after_finished=3600,
            backoff_limit=1,
            template=pod_template,
        ),
    )

    log.info("Spawning review PM job %s for task %d", name, task_id)
    batch_v1.create_namespaced_job(namespace=K8S_NAMESPACE, body=job, _request_timeout=30)
|
||||
|
||||
|
||||
def orchestrate_review_tasks(
    vikunja_token: str,
    batch_v1: k8s_client.BatchV1Api,
    project_id: int,
    view_id: int,
    in_review_id: int,
) -> None:
    """
    Scan the Review bucket. For each not-done task that carries a valid agent
    label whose role is not in REVIEW_SKIP_ROLES, ask ``spawn_review_pm_job``
    to create a PM job (it is a no-op when that job already exists).

    A failure to spawn the job for one task is logged and does not stop the
    remaining tasks from being processed, nor the rest of the dispatcher run.
    Errors while listing tasks from Vikunja still propagate.
    """
    page_size = 50
    page = 1
    tasks = []
    while True:
        batch = vikunja_get(vikunja_token, f"projects/{project_id}/tasks", page=page, per_page=page_size)
        if not batch:
            break
        tasks.extend(batch)
        if len(batch) < page_size:
            # Short page: nothing further to fetch.
            break
        page += 1

    review_tasks = [
        t for t in tasks
        if not t.get("done") and t.get("bucket_id") == in_review_id
    ]
    log.info("Review orchestration: checking %d tasks in Review bucket", len(review_tasks))

    for task in review_tasks:
        task_id = task["id"]
        role = extract_agent_role(task)

        if not role or role in REVIEW_SKIP_ROLES:
            log.debug("Task %d role=%s skipped for review orchestration", task_id, role)
            continue

        try:
            spawn_review_pm_job(
                batch_v1,
                task_id=task_id,
                task_title=task.get("title", f"Task {task_id}"),
                in_review_bucket_id=in_review_id,
                project_id=project_id,
                view_id=view_id,
            )
        except Exception as e:
            # e.g. a create conflict or a transient API error: the task stays
            # in Review and is retried on the next dispatcher run.
            log.warning("Failed to spawn review PM job for task %d: %s", task_id, e)
|
||||
|
||||
|
||||
# ── Kubernetes ────────────────────────────────────────────────────────────────
|
||||
|
||||
def load_k8s_config() -> None:
|
||||
|
|
@ -388,8 +663,9 @@ def main() -> None:
|
|||
in_progress_id = buckets.get(BUCKET_IN_PROGRESS)
|
||||
in_review_id = buckets.get(BUCKET_IN_REVIEW)
|
||||
done_id = buckets.get(BUCKET_DONE)
|
||||
backlog_id = buckets.get(BUCKET_BACKLOG)
|
||||
|
||||
if not all([todo_id, in_progress_id, in_review_id, done_id]):
|
||||
if not all([todo_id, in_progress_id, in_review_id, done_id, backlog_id]):
|
||||
log.error("Could not find all standard buckets. Found: %s", list(buckets.keys()))
|
||||
sys.exit(1)
|
||||
|
||||
|
|
@ -397,10 +673,20 @@ def main() -> None:
|
|||
load_k8s_config()
|
||||
batch_v1 = k8s_client.BatchV1Api()
|
||||
|
||||
# Sweep: move any done=True tasks into the Done bucket
|
||||
# 1. Sweep: move any done=True tasks into the Done bucket
|
||||
sweep_done_tasks(vikunja_token, project_id, view_id, done_id)
|
||||
|
||||
# Scan Todo bucket for claimable tasks
|
||||
# 2. Watchdog: requeue or escalate stale InProgress tasks
|
||||
watchdog_stale_tasks(
|
||||
vikunja_token, batch_v1,
|
||||
project_id, view_id,
|
||||
in_progress_id, todo_id, backlog_id,
|
||||
)
|
||||
|
||||
# 3. Review orchestration: spawn PM jobs for tasks awaiting review
|
||||
orchestrate_review_tasks(vikunja_token, batch_v1, project_id, view_id, in_review_id)
|
||||
|
||||
# 4. Scan Todo bucket for claimable tasks
|
||||
tasks = list_todo_tasks(vikunja_token, project_id, todo_id)
|
||||
log.info("Found %d candidate tasks in Todo bucket", len(tasks))
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue