Coverage for src / mcp_server_langgraph / health / checks.py: 100%
69 statements
« prev ^ index » next coverage.py v7.12.0, created at 2025-12-03 00:43 +0000
« prev ^ index » next coverage.py v7.12.0, created at 2025-12-03 00:43 +0000
1"""
2Health check endpoints for Kubernetes probes
3"""
5from datetime import datetime, UTC
6from typing import Any
8from fastapi import FastAPI, status
9from fastapi.responses import JSONResponse
10from pydantic import BaseModel
12from mcp_server_langgraph.auth.openfga import OpenFGAClient
13from mcp_server_langgraph.core.config import settings
14from mcp_server_langgraph.observability.telemetry import logger
15from mcp_server_langgraph.secrets.manager import get_secrets_manager
# FastAPI application object that the probe endpoints in this module register on.
app = FastAPI(
    title="MCP Server with LangGraph Health",
)
class HealthResponse(BaseModel):
    """Payload schema built by the liveness and readiness endpoints."""

    # Overall state label (the endpoints use "healthy", "ready" or "not_ready").
    status: str

    # Time the response was built, as an ISO-8601 string.
    timestamp: str

    # Service version string reported to the caller.
    version: str

    # Per-component results, keyed by component name.
    checks: dict[str, Any]
@app.get("/")
async def health_check() -> HealthResponse:
    """
    Liveness probe: answers as long as the application process is running.

    Kubernetes uses this to decide whether the pod needs a restart.

    NOTE: The app is mounted at /health in the main application, so this
    route is reached at /health/.
    """
    # No dependency is inspected here; being able to respond is the check.
    generated_at = datetime.now(UTC).isoformat()
    liveness_checks = {"application": "running"}
    return HealthResponse(
        status="healthy",
        timestamp=generated_at,
        version=settings.service_version,
        checks=liveness_checks,
    )
@app.get("/live")
async def liveness_check() -> HealthResponse:
    """
    Liveness probe alias: returns exactly what the root health check returns.

    Kubernetes' liveness probe calls this at /health/live.
    """
    liveness_payload = await health_check()
    return liveness_payload
@app.get("/ready", response_model=None)
async def readiness_check() -> JSONResponse:
    """
    Readiness probe: tells Kubernetes whether the pod may receive traffic.

    Responds with HTTP 200 and status "ready" when every blocking check
    passes, otherwise HTTP 503 and status "not_ready". Blocking checks are
    OpenFGA client construction (only when OpenFGA is configured) and the
    presence of the critical secrets. An Infisical failure is reported as
    "degraded" but never blocks readiness.
    """
    results: dict[str, Any] = {}
    ready = True

    # --- OpenFGA -----------------------------------------------------------
    openfga_configured = settings.openfga_store_id and settings.openfga_model_id
    if not openfga_configured:
        results["openfga"] = {"status": "not_configured", "message": "OpenFGA not configured"}
    else:
        try:
            # Only construction is exercised: if the client can be built,
            # the dependency is reported as healthy.
            OpenFGAClient(
                api_url=settings.openfga_api_url,
                store_id=settings.openfga_store_id,
                model_id=settings.openfga_model_id,
            )
            results["openfga"] = {"status": "healthy", "url": settings.openfga_api_url}
        except Exception as exc:
            results["openfga"] = {"status": "unhealthy", "error": str(exc)}
            ready = False
            logger.error(f"OpenFGA health check failed: {exc}")

    # --- Infisical (optional) ----------------------------------------------
    try:
        manager = get_secrets_manager()
        if not manager.client:
            results["infisical"] = {"status": "not_configured", "message": "Using environment variables"}
        else:
            # Probe lookup with a fallback value; the result itself is unused.
            manager.get_secret("HEALTH_CHECK_TEST", fallback="ok")
            results["infisical"] = {"status": "healthy", "url": settings.infisical_site_url}
    except Exception as exc:
        # A fallback exists, so an Infisical outage must not fail readiness.
        results["infisical"] = {"status": "degraded", "message": "Fallback mode active", "error": str(exc)}
        logger.warning(f"Infisical health check failed: {exc}")

    # --- Critical secrets ----------------------------------------------------
    required_secrets = (
        ("ANTHROPIC_API_KEY", settings.anthropic_api_key),
        ("JWT_SECRET_KEY", settings.jwt_secret_key),
    )
    absent = [secret_name for secret_name, secret_value in required_secrets if not secret_value]
    if absent:
        results["secrets"] = {"status": "unhealthy", "missing": ", ".join(absent)}
        ready = False
    else:
        results["secrets"] = {"status": "healthy", "message": "All critical secrets loaded"}

    # --- Response ------------------------------------------------------------
    if ready:
        state_label, http_code = "ready", status.HTTP_200_OK
    else:
        state_label, http_code = "not_ready", status.HTTP_503_SERVICE_UNAVAILABLE

    payload = HealthResponse(
        status=state_label,
        timestamp=datetime.now(UTC).isoformat(),
        version=settings.service_version,
        checks=results,
    )
    return JSONResponse(status_code=http_code, content=payload.model_dump())
@app.get("/startup", response_model=None)
async def startup_check() -> JSONResponse | dict[str, Any]:
    """
    Startup probe: reports whether the application has finished starting.

    Kubernetes waits for this to succeed before it begins the liveness and
    readiness probes. Returns a plain dict with status "started" on success,
    or a 503 JSONResponse with status "starting" if the logger call raises.

    NOTE: The app is mounted at /health in the main application, so this
    route is reached at /health/startup.
    """
    # Reading the service name from settings doubles as the "config loaded" check.
    component_checks: dict[str, Any] = {
        "config": {"status": "loaded", "service": settings.service_name},
    }

    # Emitting one log record is the test that logging is usable.
    try:
        logger.info("Startup health check")
    except Exception as exc:
        component_checks["logging"] = {"status": "failed", "error": str(exc)}
        return JSONResponse(
            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
            content={"status": "starting", "checks": component_checks},
        )
    else:
        component_checks["logging"] = {"status": "initialized"}

    started_at = datetime.now(UTC).isoformat()
    return {"status": "started", "timestamp": started_at, "checks": component_checks}
@app.get("/metrics/prometheus")
async def prometheus_metrics() -> dict[str, Any]:
    """
    Prometheus metrics endpoint.

    Placeholder until OpenTelemetry's Prometheus exporter is integrated:
    returns a dict whose "metrics" key holds the lines of a single info
    gauge labelled with the service version and name.
    """
    metric_lines = [
        "# HELP langgraph_agent_info Application information",
        "# TYPE langgraph_agent_info gauge",
        f'langgraph_agent_info{{version="{settings.service_version}",service="{settings.service_name}"}} 1',
    ]
    return {"metrics": metric_lines}
if __name__ == "__main__":
    import uvicorn

    # Resolve the port first; the trailing `or "8000"` covers a falsy lookup result.
    health_port = int(settings.get_secret("HEALTH_PORT", fallback="8000") or "8000")

    # Listen on every interface so the server is reachable inside Docker/Kubernetes.
    uvicorn.run(
        app,
        host="0.0.0.0",  # nosec B104 - Required for containerized deployment
        port=health_port,
        log_level="info",
    )