Data Incident RCA Automation

Python CLI tool to create pipeline incidents, append timeline events, close them with a root cause analysis, and export structured RCA JSON reports. Integrates with Slack alerting.

Data Observability · Intermediate · Python

Code preview

154 lines

Replace {{PLACEHOLDERS}} with your environment values, then deploy to your stack.

"""
DATA INCIDENT RCA AUTOMATION
Log pipeline incidents, capture timeline, and export RCA report.

Usage:
  python incident_rca_automation.py create --title "Orders mart stale"
  python incident_rca_automation.py close --incident-id INC-20260702-141500 --root-cause "Upstream delay"
"""

from __future__ import annotations

import argparse
import json
from dataclasses import asdict, dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Literal

# ── CONFIGURATION ─────────────────────────────────────────────────────────────
# Every "{{...}}" value below is a template placeholder and must be replaced
# with a real value for your environment before the tool is run.
#
# NOTE(review): RCA_STORE_PATH is a pathlib.Path, which models local filesystem
# paths; an S3 location would need a wrapper as the inline hint suggests —
# confirm against the storage helpers.
RCA_STORE_PATH = Path("{{RCA_OUTPUT_DIR}}")   # e.g. ./incidents or s3 path wrapper
# Owner recorded on every incident built by create_incident.
DEFAULT_OWNER = "{{OWNER_TEAM}}"               # e.g. data-platform
DEFAULT_SEVERITY = "{{DEFAULT_SEVERITY}}"      # SEV1 | SEV2 | SEV3 | SEV4
# Presumably the Slack channel that notifications are sent to — verify against
# the notification helper.
ALERT_CHANNEL = "{{SLACK_CHANNEL}}"            # e.g. #data-incidents

# Closed sets of allowed values. Literal types are checked by static type
# checkers only; nothing here validates them at runtime.
Severity = Literal["SEV1", "SEV2", "SEV3", "SEV4"]
Status = Literal["OPEN", "MITIGATED", "CLOSED"]


@dataclass
class TimelineEvent:
    """A single entry in an incident's timeline."""

    # When the event was recorded; create_incident and add_timeline_event
    # fill this with a UTC ISO 8601 string.
    timestamp: str
    # Who recorded the event ("system" for the automatic creation entry).
    actor: str
    # Free-text description of what happened.
    event: str


@dataclass
class Incident:
    """A pipeline incident record, including its RCA fields and timeline.

    The first seven fields are required; the remaining fields default to
    empty values. Mutable defaults use ``default_factory`` so each instance
    gets its own list.
    """

    # Identifier; create_incident generates it as "INC-YYYYMMDD-HHMMSS" (UTC).
    incident_id: str
    # Short summary of the incident.
    title: str
    # One of the Severity literals (SEV1..SEV4).
    severity: Severity
    # One of the Status literals; create_incident starts incidents as "OPEN".
    status: Status
    # Owning team; create_incident uses DEFAULT_OWNER.
    owner: str
    # Name of the affected pipeline.
    pipeline: str
    # create_incident stores a UTC ISO 8601 timestamp string here.
    detected_at: str
    # None until set; presumably filled in when the incident is closed —
    # verify against the close logic.
    resolved_at: str | None = None
    # Description of customer impact; empty string when not provided.
    customer_impact: str = ""
    # Root cause text; empty until recorded.
    root_cause: str = ""
    contributing_factors: list[str] = field(default_factory=list)
    # NOTE(review): the dict schema for corrective actions is not defined in
    # this part of the file — confirm the expected keys.
    corrective_actions: list[dict] = field(default_factory=list)
    # Timeline entries in the order they were appended.
    timeline: list[TimelineEvent] = field(default_factory=list)


def _now() -> str:
    """Return the current UTC time as a timezone-aware ISO 8601 string."""
    current = datetime.now(tz=timezone.utc)
    return current.isoformat()


def create_incident(title: str, pipeline: str, severity: Severity, impact: str) -> Incident:
    """Open a new incident, persist it, and send a notification.

    The incident ID, ``detected_at`` and the first timeline entry are all
    derived from a single clock reading, so they always describe the same
    instant.

    Args:
        title: Short summary of the incident.
        pipeline: Name of the affected pipeline.
        severity: Severity level recorded on the incident.
        impact: Customer-impact description, stored as ``customer_impact``.

    Returns:
        The newly created incident, with status ``"OPEN"`` and owner
        ``DEFAULT_OWNER``.
    """
    # Read the clock exactly once: separate reads can straddle a second
    # boundary, leaving the ID and the recorded timestamps out of step.
    created = datetime.now(timezone.utc)
    created_iso = created.isoformat()

    # NOTE(review): the ID has one-second resolution, so two incidents created
    # within the same second receive the same ID — confirm how _save handles
    # an existing ID.
    incident_id = f"INC-{created:%Y%m%d-%H%M%S}"

    incident = Incident(
        incident_id=incident_id,
        title=title,
        severity=severity,
        status="OPEN",
        owner=DEFAULT_OWNER,
        pipeline=pipeline,
        detected_at=created_iso,
        customer_impact=impact,
        timeline=[
            TimelineEvent(created_iso, "system", f"Incident created: {title}"),
        ],
    )
    # Persist before notifying so the announced ID already refers to a saved
    # record.
    _save(incident)
    _notify(f"[{severity}] {incident_id}: {title} - pipeline `{pipeline}`")
    return incident


def add_timeline_event(incident_id: str, actor: str, event: str) -> None:
    """Append a timestamped entry to a stored incident's timeline.

    The incident is loaded by ID, a new ``TimelineEvent`` stamped with the
    current UTC time is appended, and the incident is saved again.

    Args:
        incident_id: Identifier of the incident to update.
        actor: Who is recording the event.
        event: Description of what happened.
    """
    record = _load(incident_id)
    entry = TimelineEvent(timestamp=_now(), actor=actor, event=event)
    record.timeline.append(entry)
    _save(record)

# ... download the full template for the remaining code

About this template

Python CLI tool to create pipeline incidents, append timeline events, close them with a root cause analysis, and export structured RCA JSON reports. Integrates with Slack alerting.

Tags: incident, rca, automation, on-call
Downloads: 38
Reviews: 0
Rating: -
Created: Jul 2, 2026
Updated: Jul 2, 2026
Login to review