Coverage for src/mcp_server_langgraph/observability/telemetry.py: 57%

296 statements  

coverage.py v7.12.0, created at 2025-12-03 00:43 +0000

1""" 

2Unified observability setup with OpenTelemetry and LangSmith support 

3""" 

4 

5import logging 

6import os 

7import sys 

8from logging.handlers import RotatingFileHandler, TimedRotatingFileHandler 

9from pathlib import Path 

10from typing import Any 

11 

12from opentelemetry import metrics as otel_metrics 

13from opentelemetry import trace 

14from opentelemetry.instrumentation.logging import LoggingInstrumentor 

15from opentelemetry.sdk.metrics import MeterProvider 

16from opentelemetry.sdk.metrics.export import ConsoleMetricExporter, PeriodicExportingMetricReader 

17from opentelemetry.sdk.resources import Resource 

18from opentelemetry.sdk.trace import TracerProvider 

19from opentelemetry.sdk.trace.export import BatchSpanProcessor, ConsoleSpanExporter 

20from opentelemetry.trace.propagation.tracecontext import TraceContextTextMapPropagator 

21 

22# Conditional imports for OTLP exporters (optional dependencies) 

23try: 

24 from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import OTLPMetricExporter as OTLPMetricExporterGRPC 

25 from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter as OTLPSpanExporterGRPC 

26 

27 GRPC_AVAILABLE = True 

28except ImportError: 

29 GRPC_AVAILABLE = False 

30 OTLPMetricExporterGRPC = None 

31 OTLPSpanExporterGRPC = None 

32 

33try: 

34 from opentelemetry.exporter.otlp.proto.http.metric_exporter import OTLPMetricExporter as OTLPMetricExporterHTTP 

35 from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter as OTLPSpanExporterHTTP 

36 

37 HTTP_AVAILABLE = True 

38except ImportError: 

39 HTTP_AVAILABLE = False 

40 OTLPMetricExporterHTTP = None # type: ignore[misc, assignment] 

41 OTLPSpanExporterHTTP = None # type: ignore[misc, assignment] 

42 

43from mcp_server_langgraph.observability.json_logger import CustomJSONFormatter 

44 

45# Configuration 

46SERVICE_NAME = "mcp-server-langgraph" 

47OTLP_ENDPOINT = "http://localhost:4317" # Change to your OTLP collector 

48 

49# Control verbose logging (defaults to False to reduce noise) 

50# Set OBSERVABILITY_VERBOSE=true to enable detailed initialization logs 

51OBSERVABILITY_VERBOSE = os.getenv("OBSERVABILITY_VERBOSE", "false").lower() in ("true", "1", "yes") 

52 

53 

54class ObservabilityConfig: 

55 """Centralized observability configuration with OpenTelemetry and LangSmith support""" 

56 

57 def __init__( 

58 self, 

59 service_name: str = SERVICE_NAME, 

60 otlp_endpoint: str = OTLP_ENDPOINT, 

61 enable_console_export: bool = True, 

62 enable_langsmith: bool = False, 

63 log_format: str = "json", # "json" or "text" 

64 log_json_indent: int | None = None, # None for compact, 2 for pretty-print 

65 enable_file_logging: bool = False, # NEW: opt-in file logging 

66 ): 

67 self.service_name = service_name 

68 self.otlp_endpoint = otlp_endpoint 

69 self.enable_console_export = enable_console_export 

70 self.enable_langsmith = enable_langsmith 

71 self.log_format = log_format 

72 self.log_json_indent = log_json_indent 

73 self.enable_file_logging = enable_file_logging 

74 

75 # Setup OpenTelemetry 

76 self._setup_tracing() 

77 self._setup_metrics() 

78 self._setup_logging(enable_file_logging=enable_file_logging) 

79 

80 # Setup LangSmith if enabled 

81 if self.enable_langsmith: 81 ↛ 82: line 81 didn't jump to line 82 because the condition on line 81 was never true

82 self._setup_langsmith() 

83 

84 def _setup_tracing(self) -> None: 

85 """Configure distributed tracing""" 

86 # Get version from settings 

87 try: 

88 from mcp_server_langgraph.core.config import settings 

89 

90 service_version = settings.service_version 

91 except Exception: 

92 from mcp_server_langgraph import __version__ 

93 

94 service_version = __version__ 

95 

96 # Get actual environment from settings instead of hardcoding "production" 

97 try: 

98 from mcp_server_langgraph.core.config import settings as config_settings 

99 

100 environment = config_settings.environment 

101 except Exception: 

102 environment = "unknown" 

103 

104 resource = Resource.create( 

105 {"service.name": self.service_name, "service.version": service_version, "deployment.environment": environment} 

106 ) 

107 

108 provider = TracerProvider(resource=resource) 

109 

110 # OTLP exporter for production (if available) 

111 if GRPC_AVAILABLE and OTLPSpanExporterGRPC is not None: 111 ↛ 112: line 111 didn't jump to line 112 because the condition on line 111 was never true

112 otlp_exporter = OTLPSpanExporterGRPC(endpoint=self.otlp_endpoint) 

113 provider.add_span_processor(BatchSpanProcessor(otlp_exporter)) 

114 elif HTTP_AVAILABLE and OTLPSpanExporterHTTP is not None: 114 ↛ 117: line 114 didn't jump to line 117 because the condition on line 114 was always true

115 otlp_exporter = OTLPSpanExporterHTTP(endpoint=self.otlp_endpoint.replace(":4317", ":4318")) 

116 provider.add_span_processor(BatchSpanProcessor(otlp_exporter)) 

117 elif OBSERVABILITY_VERBOSE: 

118 print("⚠ OTLP exporters not available, using console-only tracing") 

119 

120 # Console exporter for development 

121 if self.enable_console_export: 121 ↛ 125: line 121 didn't jump to line 125 because the condition on line 121 was always true

122 console_exporter = ConsoleSpanExporter() 

123 provider.add_span_processor(BatchSpanProcessor(console_exporter)) 

124 

125 trace.set_tracer_provider(provider) 

126 

127 self.tracer_provider = provider # Store provider for shutdown 

128 self.tracer = trace.get_tracer(__name__) 

129 if OBSERVABILITY_VERBOSE: 129 ↛ 130: line 129 didn't jump to line 130 because the condition on line 129 was never true

130 print(f"✓ Tracing configured: {self.service_name}") 

131 

132 def _setup_metrics(self) -> None: 

133 """Configure metrics collection""" 

134 resource = Resource.create({"service.name": self.service_name}) 

135 

136 readers = [] 

137 

138 # OTLP metric exporter (if available) 

139 if GRPC_AVAILABLE and OTLPMetricExporterGRPC is not None: 139 ↛ 140: line 139 didn't jump to line 140 because the condition on line 139 was never true

140 otlp_metric_exporter = OTLPMetricExporterGRPC(endpoint=self.otlp_endpoint) 

141 otlp_reader = PeriodicExportingMetricReader(otlp_metric_exporter, export_interval_millis=5000) 

142 readers.append(otlp_reader) 

143 elif HTTP_AVAILABLE and OTLPMetricExporterHTTP is not None: 143 ↛ 147: line 143 didn't jump to line 147 because the condition on line 143 was always true

144 otlp_metric_exporter = OTLPMetricExporterHTTP(endpoint=self.otlp_endpoint.replace(":4317", ":4318")) 

145 otlp_reader = PeriodicExportingMetricReader(otlp_metric_exporter, export_interval_millis=5000) 

146 readers.append(otlp_reader) 

147 elif OBSERVABILITY_VERBOSE: 

148 print("⚠ OTLP exporters not available, using console-only metrics") 

149 

150 # Console exporter for development 

151 if self.enable_console_export: 151 ↛ 156: line 151 didn't jump to line 156 because the condition on line 151 was always true

152 console_metric_exporter = ConsoleMetricExporter() 

153 console_reader = PeriodicExportingMetricReader(console_metric_exporter) 

154 readers.append(console_reader) 

155 

156 provider = MeterProvider(resource=resource, metric_readers=readers) 

157 

158 otel_metrics.set_meter_provider(provider) 

159 self.meter_provider = provider # Store provider for shutdown 

160 self.meter = otel_metrics.get_meter(__name__) 

161 

162 # Create common metrics 

163 self._create_metrics() 

164 if OBSERVABILITY_VERBOSE: 164 ↛ 165: line 164 didn't jump to line 165 because the condition on line 164 was never true

165 print(f"✓ Metrics configured: {self.service_name}") 

166 

167 def _create_metrics(self) -> None: 

168 """Create standard metrics for the service""" 

169 self.tool_calls = self.meter.create_counter(name="agent.tool.calls", description="Number of tool calls", unit="1") 

170 

171 self.successful_calls = self.meter.create_counter( 

172 name="agent.calls.successful", description="Number of successful calls", unit="1" 

173 ) 

174 

175 self.failed_calls = self.meter.create_counter( 

176 name="agent.calls.failed", description="Number of failed calls", unit="1" 

177 ) 

178 

179 self.auth_failures = self.meter.create_counter( 

180 name="auth.failures", description="Number of authentication failures", unit="1" 

181 ) 

182 

183 self.authz_failures = self.meter.create_counter( 

184 name="authz.failures", description="Number of authorization failures", unit="1" 

185 ) 

186 

187 self.response_time = self.meter.create_histogram( 

188 name="agent.response.duration", description="Response time distribution", unit="ms" 

189 ) 

190 

191 # Resilience pattern metrics (ADR-0026) 

192 # Circuit breaker metrics 

193 self.circuit_breaker_state_gauge = self.meter.create_gauge( 

194 name="circuit_breaker.state", 

195 description="Circuit breaker state (0=closed, 1=open, 0.5=half-open)", 

196 unit="1", 

197 ) 

198 self.circuit_breaker_failure_counter = self.meter.create_counter( 

199 name="circuit_breaker.failures", 

200 description="Total circuit breaker failures", 

201 unit="1", 

202 ) 

203 self.circuit_breaker_success_counter = self.meter.create_counter( 

204 name="circuit_breaker.successes", 

205 description="Total circuit breaker successes", 

206 unit="1", 

207 ) 

208 

209 # Retry metrics 

210 self.retry_attempt_counter = self.meter.create_counter( 

211 name="retry.attempts", 

212 description="Total retry attempts", 

213 unit="1", 

214 ) 

215 self.retry_exhausted_counter = self.meter.create_counter( 

216 name="retry.exhausted", 

217 description="Total retry exhaustion events", 

218 unit="1", 

219 ) 

220 self.retry_success_after_retry_counter = self.meter.create_counter( 

221 name="retry.success_after_retry", 

222 description="Total successful retries", 

223 unit="1", 

224 ) 

225 

226 # Timeout metrics 

227 self.timeout_exceeded_counter = self.meter.create_counter( 

228 name="timeout.exceeded", 

229 description="Total timeout violations", 

230 unit="1", 

231 ) 

232 self.timeout_duration_histogram = self.meter.create_histogram( 

233 name="timeout.duration", 

234 description="Timeout duration in seconds", 

235 unit="s", 

236 ) 

237 

238 # Bulkhead metrics 

239 self.bulkhead_rejected_counter = self.meter.create_counter( 

240 name="bulkhead.rejections", 

241 description="Total bulkhead rejections", 

242 unit="1", 

243 ) 

244 self.bulkhead_active_operations_gauge = self.meter.create_gauge( 

245 name="bulkhead.active_operations", 

246 description="Current active operations in bulkhead", 

247 unit="1", 

248 ) 

249 self.bulkhead_queue_depth_gauge = self.meter.create_gauge( 

250 name="bulkhead.queue_depth", 

251 description="Current queued operations in bulkhead", 

252 unit="1", 

253 ) 

254 

255 # Fallback metrics 

256 self.fallback_used_counter = self.meter.create_counter( 

257 name="fallback.used", 

258 description="Total fallback invocations", 

259 unit="1", 

260 ) 

261 

262 # Error counter by type (for custom exceptions) 

263 self.error_counter = self.meter.create_counter( 

264 name="error.total", 

265 description="Total errors by type", 

266 unit="1", 

267 ) 

268 

269 # Code execution metrics 

270 self.code_executions = self.meter.create_counter( 

271 name="code.executions", 

272 description="Total code execution requests", 

273 unit="1", 

274 ) 

275 

276 def _setup_logging(self, enable_file_logging: bool = False) -> None: 

277 """ 

278 Configure structured logging with OpenTelemetry and optional log rotation. 

279 

280 Implements idempotent initialization to prevent duplicate handlers 

281 when re-imported or embedded in larger services. 

282 

283 Args: 

284 enable_file_logging: Enable file-based log rotation (opt-in). Default: False. 

285 Set to True for production deployments with persistent storage. 

286 Leave False for serverless, containers, or read-only environments. 

287 """ 

288 # Check if logging is already configured (idempotent guard) 

289 root_logger = logging.getLogger() 

290 if root_logger.handlers: 290 ↛ 298: line 290 didn't jump to line 298 because the condition on line 290 was always true

291 # Logging already configured - skip to avoid duplicate handlers 

292 self.logger = logging.getLogger(self.service_name) 

293 if OBSERVABILITY_VERBOSE: 293 ↛ 294: line 293 didn't jump to line 294 because the condition on line 293 was never true

294 print("✓ Logging already configured, reusing existing setup") 

295 return 

296 

297 # Instrument logging to include trace context 

298 LoggingInstrumentor().instrument(set_logging_format=True) 

299 

300 # Choose formatter based on log_format setting 

301 formatter: logging.Formatter 

302 console_formatter: logging.Formatter 

303 

304 if self.log_format == "json": 

305 # JSON formatter with trace context 

306 formatter = CustomJSONFormatter( 

307 service_name=self.service_name, 

308 include_hostname=True, 

309 indent=self.log_json_indent, 

310 ) 

311 # Console uses pretty-print in development, compact in production 

312 console_formatter = CustomJSONFormatter( 

313 service_name=self.service_name, 

314 include_hostname=False, # Skip hostname for console 

315 indent=2 if os.getenv("ENVIRONMENT", "development") == "development" else None, 

316 ) 

317 else: 

318 # Text formatter with trace context 

319 log_format_str = ( 

320 "%(asctime)s - %(name)s - %(levelname)s - [trace_id=%(otelTraceID)s span_id=%(otelSpanID)s] - %(message)s" 

321 ) 

322 formatter = logging.Formatter(log_format_str) 

323 console_formatter = formatter 

324 

325 # Console handler (stdout) - always enabled 

326 console_handler = logging.StreamHandler(sys.stdout) 

327 console_handler.setLevel(logging.INFO) 

328 console_handler.setFormatter(console_formatter) 

329 

330 handlers: list[logging.Handler] = [console_handler] 

331 

332 # File handlers - opt-in only 

333 if enable_file_logging: 

334 # Create logs directory if it doesn't exist 

335 logs_dir = Path("logs") 

336 logs_dir.mkdir(exist_ok=True) 

337 

338 # Rotating file handler (size-based rotation) 

339 # Rotates when file reaches 10MB, keeps 5 backup files 

340 rotating_handler = RotatingFileHandler( 

341 filename=logs_dir / f"{self.service_name}.log", 

342 maxBytes=10 * 1024 * 1024, # 10MB 

343 backupCount=5, 

344 encoding="utf-8", 

345 ) 

346 rotating_handler.setLevel(logging.INFO) 

347 rotating_handler.setFormatter(formatter) 

348 handlers.append(rotating_handler) 

349 

350 # Time-based rotating handler (daily rotation) 

351 # Rotates daily at midnight, keeps 30 days of logs 

352 daily_handler = TimedRotatingFileHandler( 

353 filename=logs_dir / f"{self.service_name}-daily.log", 

354 when="midnight", 

355 interval=1, 

356 backupCount=30, 

357 encoding="utf-8", 

358 ) 

359 daily_handler.setLevel(logging.INFO) 

360 daily_handler.setFormatter(formatter) 

361 handlers.append(daily_handler) 

362 

363 # Error log handler (only ERROR and CRITICAL) 

364 error_handler = RotatingFileHandler( 

365 filename=logs_dir / f"{self.service_name}-error.log", 

366 maxBytes=10 * 1024 * 1024, # 10MB 

367 backupCount=5, 

368 encoding="utf-8", 

369 ) 

370 error_handler.setLevel(logging.ERROR) 

371 error_handler.setFormatter(formatter) 

372 handlers.append(error_handler) 

373 

374 # Configure root logger 

375 logging.basicConfig(level=logging.INFO, handlers=handlers) 

376 

377 self.logger = logging.getLogger(self.service_name) 

378 if OBSERVABILITY_VERBOSE: 

379 print(f"✓ Logging configured: {self.service_name}") 

380 print(f" - Format: {self.log_format.upper()}") 

381 print(" - Console output: INFO and above") 

382 if enable_file_logging: 

383 print(f" - Main log: logs/{self.service_name}.log (rotating, 10MB, 5 backups)") 

384 print(f" - Daily log: logs/{self.service_name}-daily.log (daily, 30 days)") 

385 print(f" - Error log: logs/{self.service_name}-error.log (ERROR and above)") 

386 else: 

387 print(" - File logging: disabled (console only)") 

388 

389 def get_tracer(self) -> trace.Tracer: 

390 """Get tracer instance""" 

391 return self.tracer 

392 

393 def get_meter(self) -> otel_metrics.Meter: 

394 """Get meter instance""" 

395 return self.meter 

396 

397 def get_logger(self) -> logging.Logger: 

398 """Get logger instance""" 

399 return self.logger 

400 

401 def _setup_langsmith(self) -> None: 

402 """Configure LangSmith tracing""" 

403 try: 

404 from mcp_server_langgraph.observability.langsmith import langsmith_config 

405 

406 if langsmith_config.is_enabled(): 

407 if OBSERVABILITY_VERBOSE: 

408 print("✓ LangSmith integration enabled") 

409 print(" - Dual observability: OpenTelemetry + LangSmith") 

410 else: 

411 if OBSERVABILITY_VERBOSE: 

412 print("⚠ LangSmith configured but not enabled (check API key)") 

413 

414 except ImportError: 

415 if OBSERVABILITY_VERBOSE: 

416 print("⚠ LangSmith not available (install langsmith package)") 

417 except Exception as e: 

418 if OBSERVABILITY_VERBOSE: 

419 print(f"⚠ LangSmith setup failed: {e}") 

420 

421 
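# Illustrative usage sketch (not part of this module's runtime path): once
# init_observability() has run, the instruments created in _create_metrics() can be
# recorded through get_config(), defined further below. The attribute key "tool" is a
# hypothetical example.
def _example_record_metrics() -> None:
    cfg = get_config()
    cfg.tool_calls.add(1, {"tool": "search"})            # count one tool invocation
    cfg.successful_calls.add(1, {"tool": "search"})      # mark it as successful
    cfg.response_time.record(42.0, {"tool": "search"})   # latency in milliseconds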

422# ============================================================================ 

423# Lazy Initialization System 

424# ============================================================================ 

425 

426_observability_config: ObservabilityConfig | None = None 

427_propagator: TraceContextTextMapPropagator | None = None 

428 

429 

430def is_initialized() -> bool: 

431 """Check if observability has been initialized.""" 

432 return _observability_config is not None 

433 

434 
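# Illustrative sketch: entry points may guard initialization explicitly, although
# init_observability() below is already idempotent. The function name is hypothetical.
def _example_guarded_init(settings: Any) -> None:
    if not is_initialized():
        init_observability(settings)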

435def init_observability( 

436 settings: Any | None = None, 

437 service_name: str = SERVICE_NAME, 

438 otlp_endpoint: str = OTLP_ENDPOINT, 

439 enable_console_export: bool = True, 

440 enable_langsmith: bool = False, 

441 log_format: str = "json", 

442 log_json_indent: int | None = None, 

443 enable_file_logging: bool = False, 

444) -> ObservabilityConfig: 

445 """ 

446 Initialize observability system (tracing, metrics, logging). 

447 

448 MUST be called explicitly by entry points after configuration is loaded. 

449 This prevents circular imports and filesystem operations during module import. 

450 

451 Args: 

452 settings: Settings object (optional, will use params if not provided) 

453 service_name: Service identifier 

454 otlp_endpoint: OpenTelemetry collector endpoint 

455 enable_console_export: Export to console for development 

456 enable_langsmith: Enable LangSmith integration 

457 log_format: "json" or "text" 

458 log_json_indent: JSON indent for pretty-printing (None for compact) 

459 enable_file_logging: Enable file-based log rotation (opt-in) 

460 

461 Returns: 

462 Initialized ObservabilityConfig instance 

463 

464 Example: 

465 >>> from mcp_server_langgraph.core.config import settings 

466 >>> from mcp_server_langgraph.observability.telemetry import init_observability 

467 >>> config = init_observability(settings, enable_file_logging=True) 

468 """ 

469 global _observability_config, _propagator 

470 

471 if _observability_config is not None: 

472 # Already initialized - return existing config 

473 return _observability_config 

474 

475 # Extract settings if provided 

476 if settings is not None: 476 ↛ 485: line 476 didn't jump to line 485 because the condition on line 476 was always true

477 enable_langsmith = settings.langsmith_tracing and settings.observability_backend in ("langsmith", "both") 

478 log_format = getattr(settings, "log_format", "json") 

479 log_json_indent = getattr(settings, "log_json_indent", None) 

480 otlp_endpoint = getattr(settings, "otlp_endpoint", OTLP_ENDPOINT) 

481 enable_console_export = getattr(settings, "enable_console_export", True) 

482 # enable_file_logging can be overridden via settings 

483 enable_file_logging = getattr(settings, "enable_file_logging", enable_file_logging) 

484 

485 _observability_config = ObservabilityConfig( 

486 service_name=service_name, 

487 otlp_endpoint=otlp_endpoint, 

488 enable_console_export=enable_console_export, 

489 enable_langsmith=enable_langsmith, 

490 log_format=log_format, 

491 log_json_indent=log_json_indent, 

492 enable_file_logging=enable_file_logging, 

493 ) 

494 

495 _propagator = TraceContextTextMapPropagator() 

496 

497 return _observability_config 

498 

499 

500def get_config() -> ObservabilityConfig: 

501 """ 

502 Get observability config (lazy accessor). 

503 

504 Raises RuntimeError if not initialized. 

505 """ 

506 if _observability_config is None: 506 ↛ 507: line 506 didn't jump to line 507 because the condition on line 506 was never true

507 msg = ( 

508 "Observability not initialized. Call init_observability(settings) " 

509 "in your entry point before using observability features." 

510 ) 

511 raise RuntimeError(msg) 

512 return _observability_config 

513 

514 

515def shutdown_observability() -> None: 

516 """ 

517 Shutdown observability system and flush all pending telemetry data. 

518 

519 This function should be called during application shutdown to ensure: 

520 - All pending spans are flushed to exporters 

521 - All pending metrics are exported 

522 - HTTP connections to collectors are closed gracefully 

523 - No telemetry data is lost 

524 

525 Usage: 

526 - Call from FastAPI lifespan shutdown event 

527 - Register with atexit.register() as fallback 

528 - Call in exception handlers before exit 

529 

530 Thread Safety: 

531 This function is NOT thread-safe. Call only during shutdown when 

532 no new telemetry is being generated. 

533 """ 

534 global _observability_config 

535 

536 if _observability_config is None: 

537 return # Nothing to shutdown 

538 

539 try: 

540 # Flush tracer spans 

541 if hasattr(_observability_config, "tracer_provider"): 541 ↛ 555: line 541 didn't jump to line 555 because the condition on line 541 was always true

542 tracer_provider = _observability_config.tracer_provider 

543 if hasattr(tracer_provider, "force_flush"): 543 ↛ 549: line 543 didn't jump to line 549 because the condition on line 543 was always true

544 tracer_provider.force_flush(timeout_millis=5000) 

545 if OBSERVABILITY_VERBOSE: 545 ↛ 546: line 545 didn't jump to line 546 because the condition on line 545 was never true

546 print("✅ Flushed trace spans", file=sys.stderr) 

547 

548 # Shutdown span processors 

549 if hasattr(tracer_provider, "shutdown"): 549 ↛ 555: line 549 didn't jump to line 555 because the condition on line 549 was always true

550 tracer_provider.shutdown() 

551 if OBSERVABILITY_VERBOSE: 551 ↛ 552: line 551 didn't jump to line 552 because the condition on line 551 was never true

552 print("✅ Shutdown tracer provider", file=sys.stderr) 

553 

554 # Flush and shutdown meter provider 

555 if hasattr(_observability_config, "meter_provider"): 555 ↛ 567: line 555 didn't jump to line 567 because the condition on line 555 was always true

556 meter_provider = _observability_config.meter_provider 

557 if hasattr(meter_provider, "force_flush"): 557 ↛ 562: line 557 didn't jump to line 562 because the condition on line 557 was always true

558 meter_provider.force_flush(timeout_millis=5000) 

559 if OBSERVABILITY_VERBOSE: 559 ↛ 560: line 559 didn't jump to line 560 because the condition on line 559 was never true

560 print("✅ Flushed metrics", file=sys.stderr) 

561 

562 if hasattr(meter_provider, "shutdown"): 562 ↛ 567: line 562 didn't jump to line 567 because the condition on line 562 was always true

563 meter_provider.shutdown() 

564 if OBSERVABILITY_VERBOSE: 564 ↛ 565: line 564 didn't jump to line 565 because the condition on line 564 was never true

565 print("✅ Shutdown meter provider", file=sys.stderr) 

566 

567 if OBSERVABILITY_VERBOSE: 567 ↛ 568: line 567 didn't jump to line 568 because the condition on line 567 was never true

568 print("✅ Observability system shutdown complete", file=sys.stderr) 

569 

570 except Exception as e: 

571 # Log error but don't raise - shutdown should be graceful 

572 print(f"⚠️ Error during observability shutdown: {e}", file=sys.stderr) 

573 

574 finally: 

575 _observability_config = None # Mark as shutdown 

576 

577 
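# Illustrative sketch of the wiring suggested in the docstring above: initialize in a
# FastAPI lifespan, shut down on exit, and register an atexit fallback. Assumes FastAPI
# is installed; the app factory name is hypothetical.
def _example_fastapi_app() -> Any:
    import atexit
    from contextlib import asynccontextmanager

    from fastapi import FastAPI

    from mcp_server_langgraph.core.config import settings

    @asynccontextmanager
    async def lifespan(app: FastAPI):
        init_observability(settings)
        atexit.register(shutdown_observability)  # fallback flush if lifespan shutdown never runs
        yield
        shutdown_observability()  # flush pending spans/metrics on graceful shutdown

    return FastAPI(lifespan=lifespan)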

578# Note: config is available via get_config() function or via the lazy 'config' proxy below 

579 

580 

581def get_tracer() -> Any: 

582 """ 

583 Get tracer instance (lazy accessor with safe fallback). 

584 

585 Returns the configured tracer if observability is initialized, 

586 otherwise returns a no-op tracer that doesn't raise errors. 

587 

588 Returns: 

589 Tracer instance (either ObservabilityConfig tracer or no-op tracer) 

590 """ 

591 if _observability_config is None: 591 ↛ 593: line 591 didn't jump to line 593 because the condition on line 591 was never true

592 # Return no-op tracer if observability not initialized 

593 from opentelemetry.trace import get_tracer as get_noop_tracer 

594 

595 return get_noop_tracer(__name__) 

596 return get_config().get_tracer() 

597 

598 

599def get_meter() -> Any: 

600 """ 

601 Get meter instance (lazy accessor with safe fallback). 

602 

603 Returns the configured meter if observability is initialized, 

604 otherwise returns a no-op meter that doesn't raise errors. 

605 

606 Returns: 

607 Meter instance (either ObservabilityConfig meter or no-op meter) 

608 """ 

609 if _observability_config is None: 

610 # Return no-op meter if observability not initialized 

611 from opentelemetry.metrics import get_meter as get_noop_meter 

612 

613 return get_noop_meter(__name__) 

614 return get_config().get_meter() 

615 

616 

617def get_logger() -> Any: 

618 """ 

619 Get logger instance (lazy accessor with safe fallback). 

620 

621 Returns the configured logger if observability is initialized, 

622 otherwise returns a basic Python logger that logs to stderr. 

623 

624 This prevents RuntimeError when functions like create_user_provider() 

625 or create_session_store() are called before init_observability() 

626 (e.g., in test fixtures or standalone scripts). 

627 

628 Returns: 

629 Logger instance (either ObservabilityConfig logger or fallback logger) 

630 """ 

631 if _observability_config is None: 631 ↛ 634: line 631 didn't jump to line 634 because the condition on line 631 was never true

632 # Return fallback logger if observability not initialized 

633 # Uses standard Python logging to stderr 

634 fallback_logger = logging.getLogger("mcp-server-langgraph-fallback") 

635 if not fallback_logger.handlers: 

636 # Configure fallback logger on first use 

637 handler = logging.StreamHandler() 

638 handler.setFormatter(logging.Formatter("%(levelname)s:%(name)s:%(message)s")) 

639 fallback_logger.addHandler(handler) 

640 fallback_logger.setLevel(logging.INFO) 

641 return fallback_logger 

642 return get_config().get_logger() 

643 

644 

645# Module-level exports with lazy initialization 

646# tracer, meter, and logger fall back to no-op/basic instances if accessed before init_observability(); config raises RuntimeError

647tracer = type("LazyTracer", (), {"__getattr__": lambda self, name: getattr(get_tracer(), name)})() 

648 

649meter = type("LazyMeter", (), {"__getattr__": lambda self, name: getattr(get_meter(), name)})() 

650 

651logger = type("LazyLogger", (), {"__getattr__": lambda self, name: getattr(get_logger(), name)})() 

652 

653# Alias for backward compatibility - provides access to both config and metric instruments 

654config = type("LazyConfig", (), {"__getattr__": lambda self, name: getattr(get_config(), name)})() 

655 

656metrics = config # metrics is an alias for config 

657 

658 
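# Illustrative sketch: the lazy proxies above resolve on first attribute access, so they
# can be imported at module load time. The span name, log message, and attribute key are
# hypothetical; metrics/config still require init_observability() to have been called.
def _example_use_lazy_proxies() -> None:
    with tracer.start_as_current_span("handle_request"):
        logger.info("processing request")
        metrics.tool_calls.add(1, {"tool": "echo"})  # raises RuntimeError before init_observability()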

659# Context propagation utilities 

660def inject_context(carrier: dict[str, str]) -> None: 

661 """Inject trace context into carrier (e.g., HTTP headers)""" 

662 if _propagator is None: 

663 msg = "Observability not initialized. Call init_observability() first." 

664 raise RuntimeError(msg) 

665 _propagator.inject(carrier) 

666 

667 

668def extract_context(carrier: dict[str, str]) -> Any: 

669 """Extract trace context from carrier""" 

670 if _propagator is None: 

671 msg = "Observability not initialized. Call init_observability() first." 

672 raise RuntimeError(msg) 

673 return _propagator.extract(carrier) 

674 

675 
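# Illustrative sketch: propagating the active trace context across an HTTP hop with W3C
# Trace Context headers. Assumes init_observability() has already run and a span is
# active; the header dict is hypothetical.
def _example_propagate_context() -> Any:
    headers: dict[str, str] = {}
    inject_context(headers)          # writes a "traceparent" entry for the outgoing request
    return extract_context(headers)  # the receiving service rebuilds the context the same way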

676# Resilience pattern metrics (convenient exports for resilience module) 

677# These are lazy proxies that safely no-op if observability not initialized 

678 

679 

680def _safe_metric_operation(metric_name: str, operation: str, *args: Any, **kwargs: Any) -> None: 

681 """ 

682 Safely perform metric operation, no-op if observability not initialized. 

683 

684 This allows resilience patterns to work even before observability is initialized, 

685 which is common in tests and during application startup. 

686 """ 

687 if _observability_config is None: 

688 # Observability not initialized - no-op 

689 return 

690 try: 

691 metric = getattr(_observability_config, metric_name) 

692 getattr(metric, operation)(*args, **kwargs) 

693 except Exception: 

694 # Silently ignore metric errors to prevent breaking application logic 

695 pass 

696 

697 

698circuit_breaker_state_gauge = type( 

699 "LazyMetric", 

700 (), 

701 { 

702 "add": lambda self, *args, **kwargs: _safe_metric_operation("circuit_breaker_state_gauge", "add", *args, **kwargs), 

703 "set": lambda self, *args, **kwargs: _safe_metric_operation("circuit_breaker_state_gauge", "set", *args, **kwargs), 

704 }, 

705)() 

706 

707circuit_breaker_failure_counter = type( 

708 "LazyMetric", 

709 (), 

710 {"add": lambda self, *args, **kwargs: _safe_metric_operation("circuit_breaker_failure_counter", "add", *args, **kwargs)}, 

711)() 

712 

713circuit_breaker_success_counter = type( 

714 "LazyMetric", 

715 (), 

716 {"add": lambda self, *args, **kwargs: _safe_metric_operation("circuit_breaker_success_counter", "add", *args, **kwargs)}, 

717)() 

718 

719retry_attempt_counter = type( 

720 "LazyMetric", 

721 (), 

722 {"add": lambda self, *args, **kwargs: _safe_metric_operation("retry_attempt_counter", "add", *args, **kwargs)}, 

723)() 

724 

725retry_exhausted_counter = type( 

726 "LazyMetric", 

727 (), 

728 {"add": lambda self, *args, **kwargs: _safe_metric_operation("retry_exhausted_counter", "add", *args, **kwargs)}, 

729)() 

730 

731retry_success_after_retry_counter = type( 

732 "LazyMetric", 

733 (), 

734 {"add": lambda self, *args, **kwargs: _safe_metric_operation("retry_success_after_retry_counter", "add", *args, **kwargs)}, 

735)() 

736 

737timeout_exceeded_counter = type( 

738 "LazyMetric", 

739 (), 

740 {"add": lambda self, *args, **kwargs: _safe_metric_operation("timeout_exceeded_counter", "add", *args, **kwargs)}, 

741)() 

742 

743timeout_duration_histogram = type( 

744 "LazyMetric", 

745 (), 

746 {"record": lambda self, *args, **kwargs: _safe_metric_operation("timeout_duration_histogram", "record", *args, **kwargs)}, 

747)() 

748 

749bulkhead_rejected_counter = type( 

750 "LazyMetric", 

751 (), 

752 {"add": lambda self, *args, **kwargs: _safe_metric_operation("bulkhead_rejected_counter", "add", *args, **kwargs)}, 

753)() 

754 

755bulkhead_active_operations_gauge = type( 

756 "LazyMetric", 

757 (), 

758 {"set": lambda self, *args, **kwargs: _safe_metric_operation("bulkhead_active_operations_gauge", "set", *args, **kwargs)}, 

759)() 

760 

761bulkhead_queue_depth_gauge = type( 

762 "LazyMetric", 

763 (), 

764 {"set": lambda self, *args, **kwargs: _safe_metric_operation("bulkhead_queue_depth_gauge", "set", *args, **kwargs)}, 

765)() 

766 

767fallback_used_counter = type( 

768 "LazyMetric", 

769 (), 

770 {"add": lambda self, *args, **kwargs: _safe_metric_operation("fallback_used_counter", "add", *args, **kwargs)}, 

771)() 

772 

773error_counter = type( 

774 "LazyMetric", (), {"add": lambda self, *args, **kwargs: _safe_metric_operation("error_counter", "add", *args, **kwargs)} 

775)()
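# Illustrative sketch: resilience code can record these proxies unconditionally; they
# silently no-op until init_observability() has run. The attribute key "operation" is a
# hypothetical example.
def _example_record_resilience_metrics() -> None:
    retry_attempt_counter.add(1, {"operation": "llm_call"})
    timeout_duration_histogram.record(2.5, {"operation": "llm_call"})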