Coverage for src/mcp_server_langgraph/observability/telemetry.py: 57%

296 statements  

coverage.py v7.12.0, created at 2025-12-03 00:43 +0000

1""" 

2Unified observability setup with OpenTelemetry and LangSmith support 

3""" 

4 

5import logging 

6import os 

7import sys 

8from logging.handlers import RotatingFileHandler, TimedRotatingFileHandler 

9from pathlib import Path 

10from typing import Any 

11 

12from opentelemetry import metrics as otel_metrics 

13from opentelemetry import trace 

14from opentelemetry.instrumentation.logging import LoggingInstrumentor 

15from opentelemetry.sdk.metrics import MeterProvider 

16from opentelemetry.sdk.metrics.export import ConsoleMetricExporter, PeriodicExportingMetricReader 

17from opentelemetry.sdk.resources import Resource 

18from opentelemetry.sdk.trace import TracerProvider 

19from opentelemetry.sdk.trace.export import BatchSpanProcessor, ConsoleSpanExporter 

20from opentelemetry.trace.propagation.tracecontext import TraceContextTextMapPropagator 

21 

22# Conditional imports for OTLP exporters (optional dependencies) 

23try: 

24 from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import OTLPMetricExporter as OTLPMetricExporterGRPC 

25 from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter as OTLPSpanExporterGRPC 

26 

27 GRPC_AVAILABLE = True 

28except ImportError: 

29 GRPC_AVAILABLE = False 

30 OTLPMetricExporterGRPC = None 

31 OTLPSpanExporterGRPC = None 

32 

33try: 

34 from opentelemetry.exporter.otlp.proto.http.metric_exporter import OTLPMetricExporter as OTLPMetricExporterHTTP 

35 from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter as OTLPSpanExporterHTTP 

36 

37 HTTP_AVAILABLE = True 

38except ImportError: 

39 HTTP_AVAILABLE = False 

40 OTLPMetricExporterHTTP = None # type: ignore[misc, assignment] 

41 OTLPSpanExporterHTTP = None # type: ignore[misc, assignment] 

42 

43from mcp_server_langgraph.observability.json_logger import CustomJSONFormatter 

44 

45# Configuration 

46SERVICE_NAME = "mcp-server-langgraph" 

47OTLP_ENDPOINT = "http://localhost:4317" # Change to your OTLP collector 

48 

49# Control verbose logging (defaults to False to reduce noise) 

50# Set OBSERVABILITY_VERBOSE=true to enable detailed initialization logs 

51OBSERVABILITY_VERBOSE = os.getenv("OBSERVABILITY_VERBOSE", "false").lower() in ("true", "1", "yes") 

52 

53 

54class ObservabilityConfig: 

55 """Centralized observability configuration with OpenTelemetry and LangSmith support""" 

56 

57 def __init__( 

58 self, 

59 service_name: str = SERVICE_NAME, 

60 otlp_endpoint: str = OTLP_ENDPOINT, 

61 enable_console_export: bool = True, 

62 enable_langsmith: bool = False, 

63 log_format: str = "json", # "json" or "text" 

64 log_json_indent: int | None = None, # None for compact, 2 for pretty-print 

65 enable_file_logging: bool = False, # NEW: opt-in file logging 

66 ): 

67 self.service_name = service_name 

68 self.otlp_endpoint = otlp_endpoint 

69 self.enable_console_export = enable_console_export 

70 self.enable_langsmith = enable_langsmith 

71 self.log_format = log_format 

72 self.log_json_indent = log_json_indent 

73 self.enable_file_logging = enable_file_logging 

74 

75 # Setup OpenTelemetry 

76 self._setup_tracing() 

77 self._setup_metrics() 

78 self._setup_logging(enable_file_logging=enable_file_logging) 

79 

80 # Setup LangSmith if enabled 

81 if self.enable_langsmith: 81 ↛ 82: line 81 didn't jump to line 82 because the condition on line 81 was never true

82 self._setup_langsmith() 

83 

84 def _setup_tracing(self) -> None: 

85 """Configure distributed tracing""" 

86 # Get version from settings 

87 try: 

88 from mcp_server_langgraph.core.config import settings 

89 

90 service_version = settings.service_version 

91 except Exception: 

92 from mcp_server_langgraph import __version__ 

93 

94 service_version = __version__ 

95 

96 # Get actual environment from settings instead of hardcoding "production" 

97 try: 

98 from mcp_server_langgraph.core.config import settings as config_settings 

99 

100 environment = config_settings.environment 

101 except Exception: 

102 environment = "unknown" 

103 

104 resource = Resource.create( 

105 {"service.name": self.service_name, "service.version": service_version, "deployment.environment": environment} 

106 ) 

107 

108 provider = TracerProvider(resource=resource) 

109 

110 # OTLP exporter for production (if available) 

111 if GRPC_AVAILABLE and OTLPSpanExporterGRPC is not None: 111 ↛ 112: line 111 didn't jump to line 112 because the condition on line 111 was never true

112 otlp_exporter = OTLPSpanExporterGRPC(endpoint=self.otlp_endpoint) 

113 provider.add_span_processor(BatchSpanProcessor(otlp_exporter)) 

114 elif HTTP_AVAILABLE and OTLPSpanExporterHTTP is not None: 114 ↛ 117: line 114 didn't jump to line 117 because the condition on line 114 was always true

115 otlp_exporter = OTLPSpanExporterHTTP(endpoint=self.otlp_endpoint.replace(":4317", ":4318")) 

116 provider.add_span_processor(BatchSpanProcessor(otlp_exporter)) 

117 elif OBSERVABILITY_VERBOSE: 

118 print("⚠ OTLP exporters not available, using console-only tracing") 

119 

120 # Console exporter for development 

121 if self.enable_console_export: 121 ↛ 125: line 121 didn't jump to line 125 because the condition on line 121 was always true

122 console_exporter = ConsoleSpanExporter() 

123 provider.add_span_processor(BatchSpanProcessor(console_exporter)) 

124 

125 trace.set_tracer_provider(provider) 

126 

127 self.tracer_provider = provider # Store provider for shutdown 

128 self.tracer = trace.get_tracer(__name__) 

129 if OBSERVABILITY_VERBOSE: 129 ↛ 130: line 129 didn't jump to line 130 because the condition on line 129 was never true

130 print(f"✓ Tracing configured: {self.service_name}") 

131 

132 def _setup_metrics(self) -> None: 

133 """Configure metrics collection""" 

134 resource = Resource.create({"service.name": self.service_name}) 

135 

136 readers = [] 

137 

138 # OTLP metric exporter (if available) 

139 if GRPC_AVAILABLE and OTLPMetricExporterGRPC is not None: 139 ↛ 140: line 139 didn't jump to line 140 because the condition on line 139 was never true

140 otlp_metric_exporter = OTLPMetricExporterGRPC(endpoint=self.otlp_endpoint) 

141 otlp_reader = PeriodicExportingMetricReader(otlp_metric_exporter, export_interval_millis=5000) 

142 readers.append(otlp_reader) 

143 elif HTTP_AVAILABLE and OTLPMetricExporterHTTP is not None: 143 ↛ 147: line 143 didn't jump to line 147 because the condition on line 143 was always true

144 otlp_metric_exporter = OTLPMetricExporterHTTP(endpoint=self.otlp_endpoint.replace(":4317", ":4318")) 

145 otlp_reader = PeriodicExportingMetricReader(otlp_metric_exporter, export_interval_millis=5000) 

146 readers.append(otlp_reader) 

147 elif OBSERVABILITY_VERBOSE: 

148 print("⚠ OTLP exporters not available, using console-only metrics") 

149 

150 # Console exporter for development 

151 if self.enable_console_export: 151 ↛ 156: line 151 didn't jump to line 156 because the condition on line 151 was always true

152 console_metric_exporter = ConsoleMetricExporter() 

153 console_reader = PeriodicExportingMetricReader(console_metric_exporter) 

154 readers.append(console_reader) 

155 

156 provider = MeterProvider(resource=resource, metric_readers=readers) 

157 

158 otel_metrics.set_meter_provider(provider) 

159 self.meter_provider = provider # Store provider for shutdown 

160 self.meter = otel_metrics.get_meter(__name__) 

161 

162 # Create common metrics 

163 self._create_metrics() 

164 if OBSERVABILITY_VERBOSE: 164 ↛ 165: line 164 didn't jump to line 165 because the condition on line 164 was never true

165 print(f"✓ Metrics configured: {self.service_name}") 

166 

167 def _create_metrics(self) -> None: 

168 """Create standard metrics for the service""" 

169 self.tool_calls = self.meter.create_counter(name="agent.tool.calls", description="Number of tool calls", unit="1") 

170 

171 self.successful_calls = self.meter.create_counter( 

172 name="agent.calls.successful", description="Number of successful calls", unit="1" 

173 ) 

174 

175 self.failed_calls = self.meter.create_counter( 

176 name="agent.calls.failed", description="Number of failed calls", unit="1" 

177 ) 

178 

179 self.auth_failures = self.meter.create_counter( 

180 name="auth.failures", description="Number of authentication failures", unit="1" 

181 ) 

182 

183 self.authz_failures = self.meter.create_counter( 

184 name="authz.failures", description="Number of authorization failures", unit="1" 

185 ) 

186 

187 self.response_time = self.meter.create_histogram( 

188 name="agent.response.duration", description="Response time distribution", unit="ms" 

189 ) 

190 

191 # Resilience pattern metrics (ADR-0026) 

192 # Circuit breaker metrics 

193 self.circuit_breaker_state_gauge = self.meter.create_gauge( 

194 name="circuit_breaker.state", 

195 description="Circuit breaker state (0=closed, 1=open, 0.5=half-open)", 

196 unit="1", 

197 ) 

198 self.circuit_breaker_failure_counter = self.meter.create_counter( 

199 name="circuit_breaker.failures", 

200 description="Total circuit breaker failures", 

201 unit="1", 

202 ) 

203 self.circuit_breaker_success_counter = self.meter.create_counter( 

204 name="circuit_breaker.successes", 

205 description="Total circuit breaker successes", 

206 unit="1", 

207 ) 

208 

209 # Retry metrics 

210 self.retry_attempt_counter = self.meter.create_counter( 

211 name="retry.attempts", 

212 description="Total retry attempts", 

213 unit="1", 

214 ) 

215 self.retry_exhausted_counter = self.meter.create_counter( 

216 name="retry.exhausted", 

217 description="Total retry exhaustion events", 

218 unit="1", 

219 ) 

220 self.retry_success_after_retry_counter = self.meter.create_counter( 

221 name="retry.success_after_retry", 

222 description="Total successful retries", 

223 unit="1", 

224 ) 

225 

226 # Timeout metrics 

227 self.timeout_exceeded_counter = self.meter.create_counter( 

228 name="timeout.exceeded", 

229 description="Total timeout violations", 

230 unit="1", 

231 ) 

232 self.timeout_duration_histogram = self.meter.create_histogram( 

233 name="timeout.duration", 

234 description="Timeout duration in seconds", 

235 unit="s", 

236 ) 

237 

238 # Bulkhead metrics 

239 self.bulkhead_rejected_counter = self.meter.create_counter( 

240 name="bulkhead.rejections", 

241 description="Total bulkhead rejections", 

242 unit="1", 

243 ) 

244 self.bulkhead_active_operations_gauge = self.meter.create_gauge( 

245 name="bulkhead.active_operations", 

246 description="Current active operations in bulkhead", 

247 unit="1", 

248 ) 

249 self.bulkhead_queue_depth_gauge = self.meter.create_gauge( 

250 name="bulkhead.queue_depth", 

251 description="Current queued operations in bulkhead", 

252 unit="1", 

253 ) 

254 

255 # Fallback metrics 

256 self.fallback_used_counter = self.meter.create_counter( 

257 name="fallback.used", 

258 description="Total fallback invocations", 

259 unit="1", 

260 ) 

261 

262 # Error counter by type (for custom exceptions) 

263 self.error_counter = self.meter.create_counter( 

264 name="error.total", 

265 description="Total errors by type", 

266 unit="1", 

267 ) 

268 

269 # Code execution metrics 

270 self.code_executions = self.meter.create_counter( 

271 name="code.executions", 

272 description="Total code execution requests", 

273 unit="1", 

274 ) 

275 

276 def _setup_logging(self, enable_file_logging: bool = False) -> None: 

277 """ 

278 Configure structured logging with OpenTelemetry and optional log rotation. 

279 

280 Implements idempotent initialization to prevent duplicate handlers 

281 when re-imported or embedded in larger services. 

282 

283 Args: 

284 enable_file_logging: Enable file-based log rotation (opt-in). Default: False. 

285 Set to True for production deployments with persistent storage. 

286 Leave False for serverless, containers, or read-only environments. 

287 """ 

288 # Check if logging is already configured (idempotent guard) 

289 root_logger = logging.getLogger() 

290 if root_logger.handlers: 290 ↛ 298: line 290 didn't jump to line 298 because the condition on line 290 was always true

291 # Logging already configured - skip to avoid duplicate handlers 

292 self.logger = logging.getLogger(self.service_name) 

293 if OBSERVABILITY_VERBOSE: 293 ↛ 294: line 293 didn't jump to line 294 because the condition on line 293 was never true

294 print("✓ Logging already configured, reusing existing setup") 

295 return 

296 

297 # Instrument logging to include trace context 

298 LoggingInstrumentor().instrument(set_logging_format=True) 

299 

300 # Choose formatter based on log_format setting 

301 formatter: logging.Formatter 

302 console_formatter: logging.Formatter 

303 

304 if self.log_format == "json": 

305 # JSON formatter with trace context 

306 formatter = CustomJSONFormatter( 

307 service_name=self.service_name, 

308 include_hostname=True, 

309 indent=self.log_json_indent, 

310 ) 

311 # Console uses pretty-print in development, compact in production 

312 console_formatter = CustomJSONFormatter( 

313 service_name=self.service_name, 

314 include_hostname=False, # Skip hostname for console 

315 indent=2 if os.getenv("ENVIRONMENT", "development") == "development" else None, 

316 ) 

317 else: 

318 # Text formatter with trace context 

319 log_format_str = ( 

320 "%(asctime)s - %(name)s - %(levelname)s - [trace_id=%(otelTraceID)s span_id=%(otelSpanID)s] - %(message)s" 

321 ) 

322 formatter = logging.Formatter(log_format_str) 

323 console_formatter = formatter 

324 

325 # Console handler (stdout) - always enabled 

326 console_handler = logging.StreamHandler(sys.stdout) 

327 console_handler.setLevel(logging.INFO) 

328 console_handler.setFormatter(console_formatter) 

329 

330 handlers: list[logging.Handler] = [console_handler] 

331 

332 # File handlers - opt-in only 

333 if enable_file_logging: 

334 # Create logs directory if it doesn't exist 

335 logs_dir = Path("logs") 

336 logs_dir.mkdir(exist_ok=True) 

337 

338 # Rotating file handler (size-based rotation) 

339 # Rotates when file reaches 10MB, keeps 5 backup files 

340 rotating_handler = RotatingFileHandler( 

341 filename=logs_dir / f"{self.service_name}.log", 

342 maxBytes=10 * 1024 * 1024, # 10MB 

343 backupCount=5, 

344 encoding="utf-8", 

345 ) 

346 rotating_handler.setLevel(logging.INFO) 

347 rotating_handler.setFormatter(formatter) 

348 handlers.append(rotating_handler) 

349 

350 # Time-based rotating handler (daily rotation) 

351 # Rotates daily at midnight, keeps 30 days of logs 

352 daily_handler = TimedRotatingFileHandler( 

353 filename=logs_dir / f"{self.service_name}-daily.log", 

354 when="midnight", 

355 interval=1, 

356 backupCount=30, 

357 encoding="utf-8", 

358 ) 

359 daily_handler.setLevel(logging.INFO) 

360 daily_handler.setFormatter(formatter) 

361 handlers.append(daily_handler) 

362 

363 # Error log handler (only ERROR and CRITICAL) 

364 error_handler = RotatingFileHandler( 

365 filename=logs_dir / f"{self.service_name}-error.log", 

366 maxBytes=10 * 1024 * 1024, # 10MB 

367 backupCount=5, 

368 encoding="utf-8", 

369 ) 

370 error_handler.setLevel(logging.ERROR) 

371 error_handler.setFormatter(formatter) 

372 handlers.append(error_handler) 

373 

374 # Configure root logger 

375 logging.basicConfig(level=logging.INFO, handlers=handlers) 

376 

377 self.logger = logging.getLogger(self.service_name) 

378 if OBSERVABILITY_VERBOSE: 

379 print(f"✓ Logging configured: {self.service_name}") 

380 print(f" - Format: {self.log_format.upper()}") 

381 print(" - Console output: INFO and above") 

382 if enable_file_logging: 

383 print(f" - Main log: logs/{self.service_name}.log (rotating, 10MB, 5 backups)") 

384 print(f" - Daily log: logs/{self.service_name}-daily.log (daily, 30 days)") 

385 print(f" - Error log: logs/{self.service_name}-error.log (ERROR and above)") 

386 else: 

387 print(" - File logging: disabled (console only)") 

388 

389 def get_tracer(self) -> trace.Tracer: 

390 """Get tracer instance""" 

391 return self.tracer 

392 

393 def get_meter(self) -> otel_metrics.Meter: 

394 """Get meter instance""" 

395 return self.meter 

396 

397 def get_logger(self) -> logging.Logger: 

398 """Get logger instance""" 

399 return self.logger 

400 

401 def _setup_langsmith(self) -> None: 

402 """Configure LangSmith tracing""" 

403 try: 

404 from mcp_server_langgraph.observability.langsmith import langsmith_config 

405 

406 if langsmith_config.is_enabled(): 

407 if OBSERVABILITY_VERBOSE: 

408 print("✓ LangSmith integration enabled") 

409 print(" - Dual observability: OpenTelemetry + LangSmith") 

410 else: 

411 if OBSERVABILITY_VERBOSE: 

412 print("⚠ LangSmith configured but not enabled (check API key)") 

413 

414 except ImportError: 

415 if OBSERVABILITY_VERBOSE: 

416 print("⚠ LangSmith not available (install langsmith package)") 

417 except Exception as e: 

418 if OBSERVABILITY_VERBOSE: 

419 print(f"⚠ LangSmith setup failed: {e}") 

420 

421 
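# Illustrative usage sketch (not part of this module's runtime path): once
# init_observability() has run, the instruments created in _create_metrics() can be
# recorded through get_config(), defined further below. The attribute key "tool" is a
# hypothetical example.
def _example_record_metrics() -> None:
    cfg = get_config()
    cfg.tool_calls.add(1, {"tool": "search"})            # count one tool invocation
    cfg.successful_calls.add(1, {"tool": "search"})      # mark it as successful
    cfg.response_time.record(42.0, {"tool": "search"})   # latency in milliseconds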

422# ============================================================================ 

423# Lazy Initialization System 

424# ============================================================================ 

425 

426_observability_config: ObservabilityConfig | None = None 

427_propagator: TraceContextTextMapPropagator | None = None 

428 

429 

430def is_initialized() -> bool: 

431 """Check if observability has been initialized.""" 

432 return _observability_config is not None 

433 

434 
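# Illustrative sketch: entry points may guard initialization explicitly, although
# init_observability() below is already idempotent. The function name is hypothetical.
def _example_guarded_init(settings: Any) -> None:
    if not is_initialized():
        init_observability(settings)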

435def init_observability( 

436 settings: Any | None = None, 

437 service_name: str = SERVICE_NAME, 

438 otlp_endpoint: str = OTLP_ENDPOINT, 

439 enable_console_export: bool = True, 

440 enable_langsmith: bool = False, 

441 log_format: str = "json", 

442 log_json_indent: int | None = None, 

443 enable_file_logging: bool = False, 

444) -> ObservabilityConfig: 

445 """ 

446 Initialize observability system (tracing, metrics, logging). 

447 

448 MUST be called explicitly by entry points after configuration is loaded. 

449 This prevents circular imports and filesystem operations during module import. 

450 

451 Args: 

452 settings: Settings object (optional, will use params if not provided) 

453 service_name: Service identifier 

454 otlp_endpoint: OpenTelemetry collector endpoint 

455 enable_console_export: Export to console for development 

456 enable_langsmith: Enable LangSmith integration 

457 log_format: "json" or "text" 

458 log_json_indent: JSON indent for pretty-printing (None for compact) 

459 enable_file_logging: Enable file-based log rotation (opt-in) 

460 

461 Returns: 

462 Initialized ObservabilityConfig instance 

463 

464 Example: 

465 >>> from mcp_server_langgraph.core.config import settings 

466 >>> from mcp_server_langgraph.observability.telemetry import init_observability 

467 >>> config = init_observability(settings, enable_file_logging=True) 

468 """ 

469 global _observability_config, _propagator 

470 

471 if _observability_config is not None: 

472 # Already initialized - return existing config 

473 return _observability_config 

474 

475 # Extract settings if provided 

476 if settings is not None: 476 ↛ 485: line 476 didn't jump to line 485 because the condition on line 476 was always true

477 enable_langsmith = settings.langsmith_tracing and settings.observability_backend in ("langsmith", "both") 

478 log_format = getattr(settings, "log_format", "json") 

479 log_json_indent = getattr(settings, "log_json_indent", None) 

480 otlp_endpoint = getattr(settings, "otlp_endpoint", OTLP_ENDPOINT) 

481 enable_console_export = getattr(settings, "enable_console_export", True) 

482 # enable_file_logging can be overridden via settings 

483 enable_file_logging = getattr(settings, "enable_file_logging", enable_file_logging) 

484 

485 _observability_config = ObservabilityConfig( 

486 service_name=service_name, 

487 otlp_endpoint=otlp_endpoint, 

488 enable_console_export=enable_console_export, 

489 enable_langsmith=enable_langsmith, 

490 log_format=log_format, 

491 log_json_indent=log_json_indent, 

492 enable_file_logging=enable_file_logging, 

493 ) 

494 

495 _propagator = TraceContextTextMapPropagator() 

496 

497 return _observability_config 

498 

499 

500def get_config() -> ObservabilityConfig: 

501 """ 

502 Get observability config (lazy accessor). 

503 

504 Raises RuntimeError if not initialized. 

505 """ 

506 if _observability_config is None: 506 ↛ 507: line 506 didn't jump to line 507 because the condition on line 506 was never true

507 msg = ( 

508 "Observability not initialized. Call init_observability(settings) " 

509 "in your entry point before using observability features." 

510 ) 

511 raise RuntimeError(msg) 

512 return _observability_config 

513 

514 

515def shutdown_observability() -> None: 

516 """ 

517 Shutdown observability system and flush all pending telemetry data. 

518 

519 This function should be called during application shutdown to ensure: 

520 - All pending spans are flushed to exporters 

521 - All pending metrics are exported 

522 - HTTP connections to collectors are closed gracefully 

523 - No telemetry data is lost 

524 

525 Usage: 

526 - Call from FastAPI lifespan shutdown event 

527 - Register with atexit.register() as fallback 

528 - Call in exception handlers before exit 

529 

530 Thread Safety: 

531 This function is NOT thread-safe. Call only during shutdown when 

532 no new telemetry is being generated. 

533 """ 

534 global _observability_config 

535 

536 if _observability_config is None: 

537 return # Nothing to shutdown 

538 

539 try: 

540 # Flush tracer spans 

541 if hasattr(_observability_config, "tracer_provider"): 541 ↛ 555: line 541 didn't jump to line 555 because the condition on line 541 was always true

542 tracer_provider = _observability_config.tracer_provider 

543 if hasattr(tracer_provider, "force_flush"): 543 ↛ 549: line 543 didn't jump to line 549 because the condition on line 543 was always true

544 tracer_provider.force_flush(timeout_millis=5000) 

545 if OBSERVABILITY_VERBOSE: 545 ↛ 546: line 545 didn't jump to line 546 because the condition on line 545 was never true

546 print("✅ Flushed trace spans", file=sys.stderr) 

547 

548 # Shutdown span processors 

549 if hasattr(tracer_provider, "shutdown"): 549 ↛ 555: line 549 didn't jump to line 555 because the condition on line 549 was always true

550 tracer_provider.shutdown() 

551 if OBSERVABILITY_VERBOSE: 551 ↛ 552: line 551 didn't jump to line 552 because the condition on line 551 was never true

552 print("✅ Shutdown tracer provider", file=sys.stderr) 

553 

554 # Flush and shutdown meter provider 

555 if hasattr(_observability_config, "meter_provider"): 555 ↛ 567: line 555 didn't jump to line 567 because the condition on line 555 was always true

556 meter_provider = _observability_config.meter_provider 

557 if hasattr(meter_provider, "force_flush"): 557 ↛ 562: line 557 didn't jump to line 562 because the condition on line 557 was always true

558 meter_provider.force_flush(timeout_millis=5000) 

559 if OBSERVABILITY_VERBOSE: 559 ↛ 560: line 559 didn't jump to line 560 because the condition on line 559 was never true

560 print("✅ Flushed metrics", file=sys.stderr) 

561 

562 if hasattr(meter_provider, "shutdown"): 562 ↛ 567: line 562 didn't jump to line 567 because the condition on line 562 was always true

563 meter_provider.shutdown() 

564 if OBSERVABILITY_VERBOSE: 564 ↛ 565: line 564 didn't jump to line 565 because the condition on line 564 was never true

565 print("✅ Shutdown meter provider", file=sys.stderr) 

566 

567 if OBSERVABILITY_VERBOSE: 567 ↛ 568: line 567 didn't jump to line 568 because the condition on line 567 was never true

568 print("✅ Observability system shutdown complete", file=sys.stderr) 

569 

570 except Exception as e: 

571 # Log error but don't raise - shutdown should be graceful 

572 print(f"⚠️ Error during observability shutdown: {e}", file=sys.stderr) 

573 

574 finally: 

575 _observability_config = None # Mark as shutdown 

576 

577 
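# Illustrative sketch of the wiring suggested in the docstring above: initialize in a
# FastAPI lifespan, shut down on exit, and register an atexit fallback. Assumes FastAPI
# is installed; the app factory name is hypothetical.
def _example_fastapi_app() -> Any:
    import atexit
    from contextlib import asynccontextmanager

    from fastapi import FastAPI

    from mcp_server_langgraph.core.config import settings

    @asynccontextmanager
    async def lifespan(app: FastAPI):
        init_observability(settings)
        atexit.register(shutdown_observability)  # fallback flush if lifespan shutdown never runs
        yield
        shutdown_observability()  # flush pending spans/metrics on graceful shutdown

    return FastAPI(lifespan=lifespan)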

578# Note: config is available via get_config() function or via the lazy 'config' proxy below 

579 

580 

581def get_tracer() -> Any: 

582 """ 

583 Get tracer instance (lazy accessor with safe fallback). 

584 

585 Returns the configured tracer if observability is initialized, 

586 otherwise returns a no-op tracer that doesn't raise errors. 

587 

588 Returns: 

589 Tracer instance (either ObservabilityConfig tracer or no-op tracer) 

590 """ 

591 if _observability_config is None: 591 ↛ 593: line 591 didn't jump to line 593 because the condition on line 591 was never true

592 # Return no-op tracer if observability not initialized 

593 from opentelemetry.trace import get_tracer as get_noop_tracer 

594 

595 return get_noop_tracer(__name__) 

596 return get_config().get_tracer() 

597 

598 

599def get_meter() -> Any: 

600 """ 

601 Get meter instance (lazy accessor with safe fallback). 

602 

603 Returns the configured meter if observability is initialized, 

604 otherwise returns a no-op meter that doesn't raise errors. 

605 

606 Returns: 

607 Meter instance (either ObservabilityConfig meter or no-op meter) 

608 """ 

609 if _observability_config is None: 

610 # Return no-op meter if observability not initialized 

611 from opentelemetry.metrics import get_meter as get_noop_meter 

612 

613 return get_noop_meter(__name__) 

614 return get_config().get_meter() 

615 

616 

617def get_logger() -> Any: 

618 """ 

619 Get logger instance (lazy accessor with safe fallback). 

620 

621 Returns the configured logger if observability is initialized, 

622 otherwise returns a basic Python logger that logs to stderr. 

623 

624 This prevents RuntimeError when functions like create_user_provider() 

625 or create_session_store() are called before init_observability() 

626 (e.g., in test fixtures or standalone scripts). 

627 

628 Returns: 

629 Logger instance (either ObservabilityConfig logger or fallback logger) 

630 """ 

631 if _observability_config is None: 631 ↛ 634: line 631 didn't jump to line 634 because the condition on line 631 was never true

632 # Return fallback logger if observability not initialized 

633 # Uses standard Python logging to stderr 

634 fallback_logger = logging.getLogger("mcp-server-langgraph-fallback") 

635 if not fallback_logger.handlers: 

636 # Configure fallback logger on first use 

637 handler = logging.StreamHandler() 

638 handler.setFormatter(logging.Formatter("%(levelname)s:%(name)s:%(message)s")) 

639 fallback_logger.addHandler(handler) 

640 fallback_logger.setLevel(logging.INFO) 

641 return fallback_logger 

642 return get_config().get_logger() 

643 

644 

645# Module-level exports with lazy initialization 

646# tracer, meter, and logger fall back to no-op/basic instances if accessed before init_observability(); config raises RuntimeError

647tracer = type("LazyTracer", (), {"__getattr__": lambda self, name: getattr(get_tracer(), name)})() 

648 

649meter = type("LazyMeter", (), {"__getattr__": lambda self, name: getattr(get_meter(), name)})() 

650 

651logger = type("LazyLogger", (), {"__getattr__": lambda self, name: getattr(get_logger(), name)})() 

652 

653# Alias for backward compatibility - provides access to both config and metric instruments 

654config = type("LazyConfig", (), {"__getattr__": lambda self, name: getattr(get_config(), name)})() 

655 

656metrics = config # metrics is an alias for config 

657 

658 
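# Illustrative sketch: the lazy proxies above resolve on first attribute access, so they
# can be imported at module load time. The span name, log message, and attribute key are
# hypothetical; metrics/config still require init_observability() to have been called.
def _example_use_lazy_proxies() -> None:
    with tracer.start_as_current_span("handle_request"):
        logger.info("processing request")
        metrics.tool_calls.add(1, {"tool": "echo"})  # raises RuntimeError before init_observability()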

659# Context propagation utilities 

660def inject_context(carrier: dict[str, str]) -> None: 

661 """Inject trace context into carrier (e.g., HTTP headers)""" 

662 if _propagator is None: 

663 msg = "Observability not initialized. Call init_observability() first." 

664 raise RuntimeError(msg) 

665 _propagator.inject(carrier) 

666 

667 

668def extract_context(carrier: dict[str, str]) -> Any: 

669 """Extract trace context from carrier""" 

670 if _propagator is None: 

671 msg = "Observability not initialized. Call init_observability() first." 

672 raise RuntimeError(msg) 

673 return _propagator.extract(carrier) 

674 

675 
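# Illustrative sketch: propagating the active trace context across an HTTP hop with W3C
# Trace Context headers. Assumes init_observability() has already run and a span is
# active; the header dict is hypothetical.
def _example_propagate_context() -> Any:
    headers: dict[str, str] = {}
    inject_context(headers)          # writes a "traceparent" entry for the outgoing request
    return extract_context(headers)  # the receiving service rebuilds the context the same way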

676# Resilience pattern metrics (convenient exports for resilience module) 

677# These are lazy proxies that safely no-op if observability not initialized 

678 

679 

680def _safe_metric_operation(metric_name: str, operation: str, *args: Any, **kwargs: Any) -> None: 

681 """ 

682 Safely perform metric operation, no-op if observability not initialized. 

683 

684 This allows resilience patterns to work even before observability is initialized, 

685 which is common in tests and during application startup. 

686 """ 

687 if _observability_config is None: 

688 # Observability not initialized - no-op 

689 return 

690 try: 

691 metric = getattr(_observability_config, metric_name) 

692 getattr(metric, operation)(*args, **kwargs) 

693 except Exception: 

694 # Silently ignore metric errors to prevent breaking application logic 

695 pass 

696 

697 

698circuit_breaker_state_gauge = type( 

699 "LazyMetric", 

700 (), 

701 { 

702 "add": lambda self, *args, **kwargs: _safe_metric_operation("circuit_breaker_state_gauge", "add", *args, **kwargs), 

703 "set": lambda self, *args, **kwargs: _safe_metric_operation("circuit_breaker_state_gauge", "set", *args, **kwargs), 

704 }, 

705)() 

706 

707circuit_breaker_failure_counter = type( 

708 "LazyMetric", 

709 (), 

710 {"add": lambda self, *args, **kwargs: _safe_metric_operation("circuit_breaker_failure_counter", "add", *args, **kwargs)}, 

711)() 

712 

713circuit_breaker_success_counter = type( 

714 "LazyMetric", 

715 (), 

716 {"add": lambda self, *args, **kwargs: _safe_metric_operation("circuit_breaker_success_counter", "add", *args, **kwargs)}, 

717)() 

718 

719retry_attempt_counter = type( 

720 "LazyMetric", 

721 (), 

722 {"add": lambda self, *args, **kwargs: _safe_metric_operation("retry_attempt_counter", "add", *args, **kwargs)}, 

723)() 

724 

725retry_exhausted_counter = type( 

726 "LazyMetric", 

727 (), 

728 {"add": lambda self, *args, **kwargs: _safe_metric_operation("retry_exhausted_counter", "add", *args, **kwargs)}, 

729)() 

730 

731retry_success_after_retry_counter = type( 

732 "LazyMetric", 

733 (), 

734 {"add": lambda self, *args, **kwargs: _safe_metric_operation("retry_success_after_retry_counter", "add", *args, **kwargs)}, 

735)() 

736 

737timeout_exceeded_counter = type( 

738 "LazyMetric", 

739 (), 

740 {"add": lambda self, *args, **kwargs: _safe_metric_operation("timeout_exceeded_counter", "add", *args, **kwargs)}, 

741)() 

742 

743timeout_duration_histogram = type( 

744 "LazyMetric", 

745 (), 

746 {"record": lambda self, *args, **kwargs: _safe_metric_operation("timeout_duration_histogram", "record", *args, **kwargs)}, 

747)() 

748 

749bulkhead_rejected_counter = type( 

750 "LazyMetric", 

751 (), 

752 {"add": lambda self, *args, **kwargs: _safe_metric_operation("bulkhead_rejected_counter", "add", *args, **kwargs)}, 

753)() 

754 

755bulkhead_active_operations_gauge = type( 

756 "LazyMetric", 

757 (), 

758 {"set": lambda self, *args, **kwargs: _safe_metric_operation("bulkhead_active_operations_gauge", "set", *args, **kwargs)}, 

759)() 

760 

761bulkhead_queue_depth_gauge = type( 

762 "LazyMetric", 

763 (), 

764 {"set": lambda self, *args, **kwargs: _safe_metric_operation("bulkhead_queue_depth_gauge", "set", *args, **kwargs)}, 

765)() 

766 

767fallback_used_counter = type( 

768 "LazyMetric", 

769 (), 

770 {"add": lambda self, *args, **kwargs: _safe_metric_operation("fallback_used_counter", "add", *args, **kwargs)}, 

771)() 

772 

773error_counter = type( 

774 "LazyMetric", (), {"add": lambda self, *args, **kwargs: _safe_metric_operation("error_counter", "add", *args, **kwargs)} 

775)()
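# Illustrative sketch: resilience code can record these proxies unconditionally; they
# silently no-op until init_observability() has run. The attribute key "operation" is a
# hypothetical example.
def _example_record_resilience_metrics() -> None:
    retry_attempt_counter.add(1, {"operation": "llm_call"})
    timeout_duration_histogram.record(2.5, {"operation": "llm_call"})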