Coverage for src / mcp_server_langgraph / health / checks.py: 100%

69 statements  

« prev     ^ index     » next       coverage.py v7.12.0, created at 2025-12-03 00:43 +0000

1""" 

2Health check endpoints for Kubernetes probes 

3""" 

4 

5from datetime import datetime, UTC 

6from typing import Any 

7 

8from fastapi import FastAPI, status 

9from fastapi.responses import JSONResponse 

10from pydantic import BaseModel 

11 

12from mcp_server_langgraph.auth.openfga import OpenFGAClient 

13from mcp_server_langgraph.core.config import settings 

14from mcp_server_langgraph.observability.telemetry import logger 

15from mcp_server_langgraph.secrets.manager import get_secrets_manager 

16 

# Dedicated FastAPI application that carries only the health/probe routes
# registered below via the @app.get decorators.
app = FastAPI(
    title="MCP Server with LangGraph Health",
)

18 

19 

class HealthResponse(BaseModel):
    """
    Payload returned by the health probe endpoints in this module.

    The liveness endpoints return it directly; the readiness endpoint
    serialises it with ``model_dump()`` into a ``JSONResponse``.
    """

    # Overall probe outcome; this module emits "healthy", "ready" or "not_ready".
    status: str
    # Time the response was built, as an ISO-8601 string.
    timestamp: str
    # Service version; callers in this module fill it from settings.service_version.
    version: str
    # Per-component results keyed by component name (e.g. "openfga", "secrets").
    checks: dict[str, Any]

27 

28 

@app.get("/")
async def health_check() -> HealthResponse:
    """
    Liveness probe - answers 200 whenever the application is running.

    Kubernetes uses this to decide whether the pod should be restarted.
    No dependency is contacted: the response only carries the current
    UTC time and the configured service version.

    NOTE: Mounted at /health in main app, so accessible at /health/
    """
    generated_at = datetime.now(UTC).isoformat()
    service_version = settings.service_version

    return HealthResponse(
        status="healthy",
        timestamp=generated_at,
        version=service_version,
        checks={"application": "running"},
    )

44 

45 

@app.get("/live")
async def liveness_check() -> HealthResponse:
    """
    Liveness probe alias - delegates to the root health check.

    Used by Kubernetes liveness probe at /health/live
    """
    liveness_payload = await health_check()
    return liveness_payload

54 

55 

def _check_openfga() -> tuple[dict[str, Any], bool]:
    """
    Build the "openfga" readiness entry.

    Returns:
        A ``(entry, ok)`` pair. ``ok`` is False only when constructing the
        client raises; an unconfigured OpenFGA does not block readiness.
    """
    if not (settings.openfga_store_id and settings.openfga_model_id):
        return {"status": "not_configured", "message": "OpenFGA not configured"}, True

    try:
        # Simple check - if client initializes, connection is OK.
        # NOTE(review): only the constructor runs here; no request to the
        # OpenFGA API is visible in this module - confirm this is sufficient.
        OpenFGAClient(
            api_url=settings.openfga_api_url,
            store_id=settings.openfga_store_id,
            model_id=settings.openfga_model_id,
        )
        return {"status": "healthy", "url": settings.openfga_api_url}, True
    except Exception as e:
        # Broad catch is deliberate: a probe must report failure, not crash.
        entry = {"status": "unhealthy", "error": str(e)}
        logger.error(f"OpenFGA health check failed: {e}")
        return entry, False


def _check_infisical() -> dict[str, Any]:
    """
    Build the "infisical" readiness entry.

    Infisical is optional: a failure is reported as "degraded" and logged
    as a warning, but never makes the service not-ready (fallback exists).
    """
    try:
        secrets_mgr = get_secrets_manager()
        if not secrets_mgr.client:
            return {"status": "not_configured", "message": "Using environment variables"}

        # Test secret retrieval; the value itself is irrelevant.
        secrets_mgr.get_secret("HEALTH_CHECK_TEST", fallback="ok")
        return {"status": "healthy", "url": settings.infisical_site_url}
    except Exception as e:
        entry = {"status": "degraded", "message": "Fallback mode active", "error": str(e)}
        # Don't fail readiness if Infisical is down (we have fallback)
        logger.warning(f"Infisical health check failed: {e}")
        return entry


def _check_critical_secrets() -> tuple[dict[str, Any], bool]:
    """
    Build the "secrets" readiness entry.

    Returns:
        A ``(entry, ok)`` pair; ``ok`` is False when any critical secret
        on ``settings`` is empty/falsy.
    """
    required = (
        ("ANTHROPIC_API_KEY", settings.anthropic_api_key),
        ("JWT_SECRET_KEY", settings.jwt_secret_key),
    )
    missing = [name for name, value in required if not value]

    if missing:
        return {"status": "unhealthy", "missing": ", ".join(missing)}, False
    return {"status": "healthy", "message": "All critical secrets loaded"}, True


@app.get("/ready", response_model=None)
async def readiness_check() -> JSONResponse:
    """
    Readiness probe - returns 200 if application can serve traffic

    Used by Kubernetes to determine if pod should receive traffic

    Runs three checks in order (OpenFGA, Infisical, critical secrets) and
    reports each under its own key in ``checks``. Only the OpenFGA and
    critical-secrets checks can make the service not-ready.

    Returns:
        JSONResponse with a serialised ``HealthResponse`` body; status 200
        and ``"ready"`` when every blocking check passes, otherwise 503 and
        ``"not_ready"``.
    """
    checks: dict[str, Any] = {}

    checks["openfga"], openfga_ok = _check_openfga()
    checks["infisical"] = _check_infisical()
    checks["secrets"], secrets_ok = _check_critical_secrets()

    all_healthy = openfga_ok and secrets_ok

    response_status = "ready" if all_healthy else "not_ready"
    http_status = status.HTTP_200_OK if all_healthy else status.HTTP_503_SERVICE_UNAVAILABLE

    return JSONResponse(
        status_code=http_status,
        content=HealthResponse(
            status=response_status,
            timestamp=datetime.now(UTC).isoformat(),
            version=settings.service_version,
            checks=checks,
        ).model_dump(),
    )

120 

121 

@app.get("/startup", response_model=None)
async def startup_check() -> JSONResponse | dict[str, Any]:
    """
    Startup probe - answers 200 once the application has fully started.

    Kubernetes uses this to decide when liveness/readiness probing may begin.

    Returns:
        A plain dict (status "started") when the logger accepts a message;
        a 503 JSONResponse (status "starting") when logging raises.

    NOTE: Mounted at /health in main app, so accessible at /health/startup
    """
    # Reading settings.service_name doubles as the "settings loaded" check.
    component_checks = {
        "config": {"status": "loaded", "service": settings.service_name},
    }

    # Emitting a log line is the "logger initialized" check.
    try:
        logger.info("Startup health check")
        component_checks["logging"] = {"status": "initialized"}
    except Exception as exc:
        component_checks["logging"] = {"status": "failed", "error": str(exc)}
        still_starting = {"status": "starting", "checks": component_checks}
        return JSONResponse(
            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
            content=still_starting,
        )

    started_at = datetime.now(UTC).isoformat()
    return {"status": "started", "timestamp": started_at, "checks": component_checks}

147 

148 

@app.get("/metrics/prometheus")
async def prometheus_metrics() -> dict[str, Any]:
    """
    Prometheus metrics endpoint

    Exposes a single ``langgraph_agent_info`` gauge labelled with the
    service version and name, as a list of exposition-style lines under
    the "metrics" key.
    """
    # This would integrate with OpenTelemetry's Prometheus exporter
    # For now, return basic info
    # NOTE(review): the lines are wrapped in a dict rather than served as
    # plain text - confirm the scraper consuming this expects that shape.
    info_sample = f'langgraph_agent_info{{version="{settings.service_version}",service="{settings.service_name}"}} 1'
    exposition_lines = [
        "# HELP langgraph_agent_info Application information",
        "# TYPE langgraph_agent_info gauge",
        info_sample,
    ]
    return {"metrics": exposition_lines}

165 

166 

if __name__ == "__main__":
    import uvicorn

    # Port is looked up as HEALTH_PORT; "8000" covers both a missing value
    # (fallback) and an empty/None result (the trailing `or`).
    health_port = int(settings.get_secret("HEALTH_PORT", fallback="8000") or "8000")

    # Bind to all interfaces for Docker/Kubernetes compatibility
    uvicorn.run(
        app,
        host="0.0.0.0",  # nosec B104 - Required for containerized deployment
        port=health_port,
        log_level="info",
    )