Data Incident RCA Automation
Python CLI tool to create pipeline incidents, append timeline events, close with root cause analysis, and export structured RCA JSON reports. Integrate with Slack alerting.
Data Observability · Intermediate · Python
Code preview
154 lines. Replace {{PLACEHOLDERS}} with your environment values, then deploy to your stack.
"""
DATA INCIDENT RCA AUTOMATION
Log pipeline incidents, capture timeline, and export RCA report.
Usage:
python incident_rca_automation.py create --title "Orders mart stale"
python incident_rca_automation.py close --incident-id INC-20260702-101500 --root-cause "Upstream delay"
"""
from __future__ import annotations
import argparse
import json
from dataclasses import asdict, dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Literal
# ── CONFIGURATION ─────────────────────────────────────────────────────────────
# Every "{{...}}" value below is a template placeholder and must be replaced
# with a real value before the tool is run.

# Location for stored incident records (presumably read/written by the
# persistence helpers further down the file — confirm there).
# NOTE(review): pathlib.Path models a filesystem path; an "s3://" URL will not
# work here without a wrapper — confirm how remote storage is meant to be used.
RCA_STORE_PATH = Path("{{RCA_OUTPUT_DIR}}") # e.g. ./incidents or s3 path wrapper
# Team recorded as `owner` on every incident built by create_incident().
DEFAULT_OWNER = "{{OWNER_TEAM}}" # e.g. data-platform
# Fallback severity; presumably the CLI default when none is given — confirm.
DEFAULT_SEVERITY = "{{DEFAULT_SEVERITY}}" # SEV1 | SEV2 | SEV3 | SEV4
# Slack channel for alerts; presumably consumed by the notifier — confirm.
ALERT_CHANNEL = "{{SLACK_CHANNEL}}" # e.g. #data-incidents
# Closed sets of allowed values, used as annotations on Incident fields.
# Literal types are checked by static type checkers only, not at runtime.
Severity = Literal["SEV1", "SEV2", "SEV3", "SEV4"]
Status = Literal["OPEN", "MITIGATED", "CLOSED"]
@dataclass
class TimelineEvent:
    """One entry in an incident's timeline.

    All fields are plain strings, so an instance converts directly to a
    JSON-friendly dict via ``dataclasses.asdict``.
    """

    # When the event happened; callers in this file pass the string from _now().
    timestamp: str
    # Who recorded the event (incident creation uses "system").
    actor: str
    # Free-text description of what happened.
    event: str
@dataclass
class Incident:
    """A pipeline incident record together with its RCA details.

    The first seven fields are required. The remaining fields default to
    empty values. List fields use ``default_factory`` so each instance gets
    its own list rather than sharing one.
    """

    # Identifier; create_incident() builds it as "INC-YYYYMMDD-HHMMSS".
    incident_id: str
    # Short human-readable summary of the incident.
    title: str
    # One of the Severity literals ("SEV1".."SEV4").
    severity: Severity
    # One of the Status literals; create_incident() starts incidents as "OPEN".
    status: Status
    # Owning team; create_incident() fills this from DEFAULT_OWNER.
    owner: str
    # Name of the affected pipeline.
    pipeline: str
    # Detection timestamp string; create_incident() sets it at creation time.
    detected_at: str
    # Resolution timestamp string; None until it is set.
    resolved_at: str | None = None
    # Free-text description of customer impact.
    customer_impact: str = ""
    # Free-text root cause; empty until it is set.
    root_cause: str = ""
    # Free-text contributing factors.
    contributing_factors: list[str] = field(default_factory=list)
    # Follow-up actions as dicts. NOTE(review): the dict schema is not defined
    # in the visible code — confirm against whatever populates this field.
    corrective_actions: list[dict] = field(default_factory=list)
    # Timeline entries; create_incident() and add_timeline_event() add to it.
    timeline: list[TimelineEvent] = field(default_factory=list)
def _now() -> str:
return datetime.now(timezone.utc).isoformat()
def create_incident(title: str, pipeline: str, severity: Severity, impact: str) -> Incident:
    """Create a new OPEN incident, save it, send a notification, and return it.

    Args:
        title: Short summary of the incident.
        pipeline: Name of the affected pipeline.
        severity: Severity level for the incident.
        impact: Description of customer impact.

    Returns:
        The new ``Incident``, owned by ``DEFAULT_OWNER`` and carrying one
        "system" timeline entry that records its creation.
    """
    # Read the clock once so the ID, detected_at, and the first timeline entry
    # all describe the same instant. Separate reads could straddle a second
    # boundary and leave the ID disagreeing with detected_at.
    opened = datetime.now(timezone.utc)
    opened_iso = opened.isoformat()
    # NOTE(review): the ID has one-second resolution, so two incidents created
    # within the same UTC second get the same ID. Whether the second save
    # overwrites the first depends on _save — confirm.
    incident_id = f"INC-{opened:%Y%m%d-%H%M%S}"
    incident = Incident(
        incident_id=incident_id,
        title=title,
        severity=severity,
        status="OPEN",
        owner=DEFAULT_OWNER,
        pipeline=pipeline,
        detected_at=opened_iso,
        customer_impact=impact,
        timeline=[
            TimelineEvent(opened_iso, "system", f"Incident created: {title}"),
        ],
    )
    # Save before notifying, so the record is stored before the alert goes out.
    _save(incident)
    _notify(f"[{severity}] {incident_id}: {title} - pipeline `{pipeline}`")
    return incident
def add_timeline_event(incident_id: str, actor: str, event: str) -> None:
    """Append a timestamped event to a stored incident and save it again.

    Args:
        incident_id: Identifier of the incident to update.
        actor: Who is recording the event.
        event: Free-text description of what happened.

    NOTE(review): behavior for an unknown ``incident_id`` is decided by
    ``_load``, which is not shown in this part of the file — confirm.
    """
    incident = _load(incident_id)
    # Stamp the entry with the current UTC time, not a caller-supplied time.
    incident.timeline.append(TimelineEvent(_now(), actor, event))
    _save(incident)
# ... download full template for remaining code
About this template
Python CLI tool to create pipeline incidents, append timeline events, close with root cause analysis, and export structured RCA JSON reports. Integrate with Slack alerting.
Tags: incident, rca, automation, on-call
Downloads: 38
Reviews: 0
Rating: -
Created: Jul 2, 2026
Updated: Jul 2, 2026