Coverage for src/mcp_server_langgraph/health/checks.py: 100%

1"""

2Health check endpoints for Kubernetes probes

3"""

5from datetime import datetime, UTC

6from typing import Any

8from fastapi import FastAPI, status

9from fastapi.responses import JSONResponse

10from pydantic import BaseModel

12from mcp_server_langgraph.auth.openfga import OpenFGAClient

13from mcp_server_langgraph.core.config import settings

14from mcp_server_langgraph.observability.telemetry import logger

15from mcp_server_langgraph.secrets.manager import get_secrets_manager

# Dedicated FastAPI application that hosts only the probe endpoints defined below.
# The endpoint docstrings note that it is mounted under /health by the main app.
app = FastAPI(
    title="MCP Server with LangGraph Health",
)

class HealthResponse(BaseModel):
    """
    Response body shared by the health probe endpoints.

    Built by the liveness endpoints directly and serialised via
    ``model_dump()`` by the readiness endpoint.
    """

    # Overall state label; this module emits "healthy", "ready" and "not_ready"
    status: str
    # ISO-8601 string produced from a timezone-aware UTC datetime
    timestamp: str
    # Value of settings.service_version at the time the response is built
    version: str
    # Per-component results keyed by component name (e.g. "openfga", "secrets")
    checks: dict[str, Any]

@app.get("/")
async def health_check() -> HealthResponse:
    """
    Liveness probe - answers with a "healthy" payload whenever the app can respond

    Kubernetes uses this to decide whether the pod needs a restart. No
    dependencies are inspected; the only check reported is that the
    application is running.

    NOTE: Mounted at /health in main app, so accessible at /health/

    Returns:
        HealthResponse with status "healthy", the current UTC time in
        ISO-8601 form, and the configured service version.
    """
    now_iso = datetime.now(UTC).isoformat()
    component_states = {"application": "running"}

    return HealthResponse(
        status="healthy",
        timestamp=now_iso,
        version=settings.service_version,
        checks=component_states,
    )

@app.get("/live")
async def liveness_check() -> HealthResponse:
    """
    Liveness probe alias - delegates to the root health check

    Used by Kubernetes liveness probe at /health/live

    Returns:
        The HealthResponse produced by ``health_check()``.
    """
    liveness_payload = await health_check()
    return liveness_payload

def _check_openfga() -> tuple[dict[str, Any], bool]:
    """Report on OpenFGA; returns (check result, whether readiness may still pass)."""
    if not (settings.openfga_store_id and settings.openfga_model_id):
        # Missing store/model IDs means OpenFGA is not in use; this does not block readiness
        return {"status": "not_configured", "message": "OpenFGA not configured"}, True

    try:
        # Constructing the client is the whole check. Whether construction
        # contacts the server depends on OpenFGAClient, which is not visible here.
        OpenFGAClient(
            api_url=settings.openfga_api_url,
            store_id=settings.openfga_store_id,
            model_id=settings.openfga_model_id,
        )
        return {"status": "healthy", "url": settings.openfga_api_url}, True
    except Exception as e:
        result = {"status": "unhealthy", "error": str(e)}
        logger.error(f"OpenFGA health check failed: {e}")
        return result, False


def _check_infisical() -> dict[str, Any]:
    """Report on the Infisical secrets backend; a failure here never blocks readiness."""
    try:
        secrets_mgr = get_secrets_manager()
        if not secrets_mgr.client:
            return {"status": "not_configured", "message": "Using environment variables"}

        # Exercise a retrieval; the fallback value keeps a missing key from being an error
        secrets_mgr.get_secret("HEALTH_CHECK_TEST", fallback="ok")
        return {"status": "healthy", "url": settings.infisical_site_url}
    except Exception as e:
        result = {"status": "degraded", "message": "Fallback mode active", "error": str(e)}
        # Only a warning: the service can fall back when Infisical is unavailable
        logger.warning(f"Infisical health check failed: {e}")
        return result


def _check_critical_secrets() -> tuple[dict[str, Any], bool]:
    """Verify required secrets are set; returns (check result, whether all are present)."""
    required = (
        ("ANTHROPIC_API_KEY", settings.anthropic_api_key),
        ("JWT_SECRET_KEY", settings.jwt_secret_key),
    )
    missing = [name for name, value in required if not value]

    if missing:
        return {"status": "unhealthy", "missing": ", ".join(missing)}, False
    return {"status": "healthy", "message": "All critical secrets loaded"}, True


@app.get("/ready", response_model=None)
async def readiness_check() -> JSONResponse:
    """
    Readiness probe - returns 200 if application can serve traffic

    Used by Kubernetes to determine if pod should receive traffic

    Three checks run in order and are reported under these keys:

    - "openfga": unhealthy (and not ready) if the OpenFGA client cannot be
      constructed; "not_configured" when store/model IDs are unset.
    - "infisical": informational only; reported as "degraded" on failure but
      never causes a 503.
    - "secrets": unhealthy (and not ready) if ANTHROPIC_API_KEY or
      JWT_SECRET_KEY is empty in settings.

    Returns:
        JSONResponse whose body is a serialised HealthResponse. Status code is
        200 with status "ready" when every blocking check passes, otherwise
        503 with status "not_ready".
    """
    checks: dict[str, Any] = {}

    openfga_result, openfga_ok = _check_openfga()
    checks["openfga"] = openfga_result

    checks["infisical"] = _check_infisical()

    secrets_result, secrets_ok = _check_critical_secrets()
    checks["secrets"] = secrets_result

    all_healthy = openfga_ok and secrets_ok
    response_status = "ready" if all_healthy else "not_ready"
    http_status = status.HTTP_200_OK if all_healthy else status.HTTP_503_SERVICE_UNAVAILABLE

    body = HealthResponse(
        status=response_status,
        timestamp=datetime.now(UTC).isoformat(),
        version=settings.service_version,
        checks=checks,
    )
    return JSONResponse(status_code=http_status, content=body.model_dump())

120

121

@app.get("/startup", response_model=None)
async def startup_check() -> JSONResponse | dict[str, Any]:
    """
    Startup probe - returns 200 when application has fully started

    Used by Kubernetes to determine when to start liveness/readiness probes

    NOTE: Mounted at /health in main app, so accessible at /health/startup

    Returns:
        A plain dict with status "started", a UTC ISO-8601 timestamp and the
        check results when the logger call succeeds; otherwise a JSONResponse
        with HTTP 503 and status "starting".
    """
    # Reading settings.service_name doubles as the "config loaded" evidence
    component_checks = {
        "config": {"status": "loaded", "service": settings.service_name},
    }

    # Emitting a log line is the test that logging has been initialised
    try:
        logger.info("Startup health check")
        component_checks["logging"] = {"status": "initialized"}
    except Exception as e:
        component_checks["logging"] = {"status": "failed", "error": str(e)}
        still_starting = {"status": "starting", "checks": component_checks}
        return JSONResponse(
            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
            content=still_starting,
        )

    started_at = datetime.now(UTC).isoformat()
    return {"status": "started", "timestamp": started_at, "checks": component_checks}

147

148

@app.get("/metrics/prometheus")
async def prometheus_metrics() -> dict[str, Any]:
    """
    Prometheus metrics endpoint

    Placeholder until OpenTelemetry's Prometheus exporter is integrated.
    Currently exposes a single ``langgraph_agent_info`` gauge labelled with
    the service version and name.

    Returns:
        A dict with one key, "metrics", holding the exposition lines as a
        list of strings.

    NOTE(review): the lines are wrapped in a dict rather than returned as
    plain text - confirm whether the scraper expects this shape.
    """
    info_sample = (
        f'langgraph_agent_info{{version="{settings.service_version}",service="{settings.service_name}"}} 1'
    )
    exposition_lines = [
        "# HELP langgraph_agent_info Application information",
        "# TYPE langgraph_agent_info gauge",
        info_sample,
    ]
    return {"metrics": exposition_lines}

165

166

if __name__ == "__main__":
    import uvicorn

    # HEALTH_PORT may be unset or empty; either way the port falls back to 8000
    configured_port = settings.get_secret("HEALTH_PORT", fallback="8000") or "8000"
    listen_port = int(configured_port)

    # Bind to all interfaces for Docker/Kubernetes compatibility
    uvicorn.run(
        app,
        host="0.0.0.0",  # nosec B104 - Required for containerized deployment
        port=listen_port,
        log_level="info",
    )

Coverage for src / mcp_server_langgraph / health / checks.py: 100%

69 statements