Coverage for src / mcp_server_langgraph / observability / telemetry.py: 57%
296 statements
« prev ^ index » next coverage.py v7.12.0, created at 2025-12-03 00:43 +0000
« prev ^ index » next coverage.py v7.12.0, created at 2025-12-03 00:43 +0000
1"""
2Unified observability setup with OpenTelemetry and LangSmith support
3"""
5import logging
6import os
7import sys
8from logging.handlers import RotatingFileHandler, TimedRotatingFileHandler
9from pathlib import Path
10from typing import Any
12from opentelemetry import metrics as otel_metrics
13from opentelemetry import trace
14from opentelemetry.instrumentation.logging import LoggingInstrumentor
15from opentelemetry.sdk.metrics import MeterProvider
16from opentelemetry.sdk.metrics.export import ConsoleMetricExporter, PeriodicExportingMetricReader
17from opentelemetry.sdk.resources import Resource
18from opentelemetry.sdk.trace import TracerProvider
19from opentelemetry.sdk.trace.export import BatchSpanProcessor, ConsoleSpanExporter
20from opentelemetry.trace.propagation.tracecontext import TraceContextTextMapPropagator
22# Conditional imports for OTLP exporters (optional dependencies)
23try:
24 from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import OTLPMetricExporter as OTLPMetricExporterGRPC
25 from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter as OTLPSpanExporterGRPC
27 GRPC_AVAILABLE = True
28except ImportError:
29 GRPC_AVAILABLE = False
30 OTLPMetricExporterGRPC = None
31 OTLPSpanExporterGRPC = None
33try:
34 from opentelemetry.exporter.otlp.proto.http.metric_exporter import OTLPMetricExporter as OTLPMetricExporterHTTP
35 from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter as OTLPSpanExporterHTTP
37 HTTP_AVAILABLE = True
38except ImportError:
39 HTTP_AVAILABLE = False
40 OTLPMetricExporterHTTP = None # type: ignore[misc, assignment]
41 OTLPSpanExporterHTTP = None # type: ignore[misc, assignment]
43from mcp_server_langgraph.observability.json_logger import CustomJSONFormatter
# Configuration
SERVICE_NAME = "mcp-server-langgraph"
OTLP_ENDPOINT = "http://localhost:4317"  # Change to your OTLP collector

# Control verbose logging (defaults to False to reduce noise).
# Set OBSERVABILITY_VERBOSE=true to enable detailed initialization logs.
_TRUTHY_VALUES = {"true", "1", "yes"}
OBSERVABILITY_VERBOSE = os.environ.get("OBSERVABILITY_VERBOSE", "false").lower() in _TRUTHY_VALUES
class ObservabilityConfig:
    """Centralized observability configuration with OpenTelemetry and LangSmith support.

    Instantiating this class configures, in order: distributed tracing,
    metrics collection (including the standard metric instruments), and
    structured logging. It installs the tracer/meter providers globally via
    the OpenTelemetry API, so it is intended to be created once per process
    (see init_observability()).
    """

    def __init__(
        self,
        service_name: str = SERVICE_NAME,
        otlp_endpoint: str = OTLP_ENDPOINT,
        enable_console_export: bool = True,
        enable_langsmith: bool = False,
        log_format: str = "json",  # "json" or "text"
        log_json_indent: int | None = None,  # None for compact, 2 for pretty-print
        enable_file_logging: bool = False,  # opt-in file logging
    ):
        """
        Args:
            service_name: Service identifier attached to all telemetry.
            otlp_endpoint: OTLP collector endpoint (gRPC form, port 4317).
            enable_console_export: Also export spans/metrics to the console.
            enable_langsmith: Enable LangSmith dual-observability integration.
            log_format: "json" for structured logs, "text" for plain text.
            log_json_indent: JSON indent (None = compact, 2 = pretty-print).
            enable_file_logging: Opt in to rotating file logs; leave False for
                serverless, containers, or read-only filesystems.
        """
        self.service_name = service_name
        self.otlp_endpoint = otlp_endpoint
        self.enable_console_export = enable_console_export
        self.enable_langsmith = enable_langsmith
        self.log_format = log_format
        self.log_json_indent = log_json_indent
        self.enable_file_logging = enable_file_logging

        # Setup OpenTelemetry
        self._setup_tracing()
        self._setup_metrics()
        self._setup_logging(enable_file_logging=enable_file_logging)

        # Setup LangSmith if enabled
        if self.enable_langsmith:
            self._setup_langsmith()

    def _setup_tracing(self) -> None:
        """Configure distributed tracing with OTLP and/or console exporters."""
        # Import settings once (previously the same settings object was
        # imported twice under two aliases). Each lookup below keeps its own
        # fallback so a partially-configured settings object still yields
        # sensible defaults.
        try:
            from mcp_server_langgraph.core.config import settings
        except Exception:
            settings = None  # type: ignore[assignment]

        try:
            # AttributeError when settings is None or lacks the field.
            service_version = settings.service_version
        except Exception:
            from mcp_server_langgraph import __version__

            service_version = __version__

        # Use the actual environment from settings instead of hardcoding it.
        try:
            environment = settings.environment
        except Exception:
            environment = "unknown"

        resource = Resource.create(
            {"service.name": self.service_name, "service.version": service_version, "deployment.environment": environment}
        )

        provider = TracerProvider(resource=resource)

        # OTLP exporter for production (if available): prefer gRPC, then fall
        # back to HTTP (OTLP/HTTP listens on 4318 rather than 4317).
        if GRPC_AVAILABLE and OTLPSpanExporterGRPC is not None:
            otlp_exporter = OTLPSpanExporterGRPC(endpoint=self.otlp_endpoint)
            provider.add_span_processor(BatchSpanProcessor(otlp_exporter))
        elif HTTP_AVAILABLE and OTLPSpanExporterHTTP is not None:
            otlp_exporter = OTLPSpanExporterHTTP(endpoint=self.otlp_endpoint.replace(":4317", ":4318"))
            provider.add_span_processor(BatchSpanProcessor(otlp_exporter))
        elif OBSERVABILITY_VERBOSE:
            print("⚠ OTLP exporters not available, using console-only tracing")

        # Console exporter for development
        if self.enable_console_export:
            console_exporter = ConsoleSpanExporter()
            provider.add_span_processor(BatchSpanProcessor(console_exporter))

        trace.set_tracer_provider(provider)

        self.tracer_provider = provider  # Store provider for shutdown
        self.tracer = trace.get_tracer(__name__)
        if OBSERVABILITY_VERBOSE:
            print(f"✓ Tracing configured: {self.service_name}")

    def _setup_metrics(self) -> None:
        """Configure metrics collection with OTLP and/or console readers."""
        resource = Resource.create({"service.name": self.service_name})

        readers = []

        # OTLP metric exporter (if available): prefer gRPC, then HTTP on 4318.
        if GRPC_AVAILABLE and OTLPMetricExporterGRPC is not None:
            otlp_metric_exporter = OTLPMetricExporterGRPC(endpoint=self.otlp_endpoint)
            otlp_reader = PeriodicExportingMetricReader(otlp_metric_exporter, export_interval_millis=5000)
            readers.append(otlp_reader)
        elif HTTP_AVAILABLE and OTLPMetricExporterHTTP is not None:
            otlp_metric_exporter = OTLPMetricExporterHTTP(endpoint=self.otlp_endpoint.replace(":4317", ":4318"))
            otlp_reader = PeriodicExportingMetricReader(otlp_metric_exporter, export_interval_millis=5000)
            readers.append(otlp_reader)
        elif OBSERVABILITY_VERBOSE:
            print("⚠ OTLP exporters not available, using console-only metrics")

        # Console exporter for development
        if self.enable_console_export:
            console_metric_exporter = ConsoleMetricExporter()
            console_reader = PeriodicExportingMetricReader(console_metric_exporter)
            readers.append(console_reader)

        provider = MeterProvider(resource=resource, metric_readers=readers)

        otel_metrics.set_meter_provider(provider)
        self.meter_provider = provider  # Store provider for shutdown
        self.meter = otel_metrics.get_meter(__name__)

        # Create common metrics
        self._create_metrics()
        if OBSERVABILITY_VERBOSE:
            print(f"✓ Metrics configured: {self.service_name}")

    def _create_metrics(self) -> None:
        """Create the standard metric instruments used across the service."""
        # Core agent call metrics
        self.tool_calls = self.meter.create_counter(name="agent.tool.calls", description="Number of tool calls", unit="1")

        self.successful_calls = self.meter.create_counter(
            name="agent.calls.successful", description="Number of successful calls", unit="1"
        )

        self.failed_calls = self.meter.create_counter(
            name="agent.calls.failed", description="Number of failed calls", unit="1"
        )

        self.auth_failures = self.meter.create_counter(
            name="auth.failures", description="Number of authentication failures", unit="1"
        )

        self.authz_failures = self.meter.create_counter(
            name="authz.failures", description="Number of authorization failures", unit="1"
        )

        self.response_time = self.meter.create_histogram(
            name="agent.response.duration", description="Response time distribution", unit="ms"
        )

        # Resilience pattern metrics (ADR-0026)
        # Circuit breaker metrics
        self.circuit_breaker_state_gauge = self.meter.create_gauge(
            name="circuit_breaker.state",
            description="Circuit breaker state (0=closed, 1=open, 0.5=half-open)",
            unit="1",
        )
        self.circuit_breaker_failure_counter = self.meter.create_counter(
            name="circuit_breaker.failures",
            description="Total circuit breaker failures",
            unit="1",
        )
        self.circuit_breaker_success_counter = self.meter.create_counter(
            name="circuit_breaker.successes",
            description="Total circuit breaker successes",
            unit="1",
        )

        # Retry metrics
        self.retry_attempt_counter = self.meter.create_counter(
            name="retry.attempts",
            description="Total retry attempts",
            unit="1",
        )
        self.retry_exhausted_counter = self.meter.create_counter(
            name="retry.exhausted",
            description="Total retry exhaustion events",
            unit="1",
        )
        self.retry_success_after_retry_counter = self.meter.create_counter(
            name="retry.success_after_retry",
            description="Total successful retries",
            unit="1",
        )

        # Timeout metrics
        self.timeout_exceeded_counter = self.meter.create_counter(
            name="timeout.exceeded",
            description="Total timeout violations",
            unit="1",
        )
        self.timeout_duration_histogram = self.meter.create_histogram(
            name="timeout.duration",
            description="Timeout duration in seconds",
            unit="s",
        )

        # Bulkhead metrics
        self.bulkhead_rejected_counter = self.meter.create_counter(
            name="bulkhead.rejections",
            description="Total bulkhead rejections",
            unit="1",
        )
        self.bulkhead_active_operations_gauge = self.meter.create_gauge(
            name="bulkhead.active_operations",
            description="Current active operations in bulkhead",
            unit="1",
        )
        self.bulkhead_queue_depth_gauge = self.meter.create_gauge(
            name="bulkhead.queue_depth",
            description="Current queued operations in bulkhead",
            unit="1",
        )

        # Fallback metrics
        self.fallback_used_counter = self.meter.create_counter(
            name="fallback.used",
            description="Total fallback invocations",
            unit="1",
        )

        # Error counter by type (for custom exceptions)
        self.error_counter = self.meter.create_counter(
            name="error.total",
            description="Total errors by type",
            unit="1",
        )

        # Code execution metrics
        self.code_executions = self.meter.create_counter(
            name="code.executions",
            description="Total code execution requests",
            unit="1",
        )

    def _setup_logging(self, enable_file_logging: bool = False) -> None:
        """
        Configure structured logging with OpenTelemetry and optional log rotation.

        Implements idempotent initialization to prevent duplicate handlers
        when re-imported or embedded in larger services.

        Args:
            enable_file_logging: Enable file-based log rotation (opt-in). Default: False.
                                 Set to True for production deployments with persistent storage.
                                 Leave False for serverless, containers, or read-only environments.
        """
        # Check if logging is already configured (idempotent guard)
        root_logger = logging.getLogger()
        if root_logger.handlers:
            # Logging already configured - skip to avoid duplicate handlers
            self.logger = logging.getLogger(self.service_name)
            if OBSERVABILITY_VERBOSE:
                print("✓ Logging already configured, reusing existing setup")
            return

        # Instrument logging to include trace context (trace_id/span_id)
        LoggingInstrumentor().instrument(set_logging_format=True)

        # Choose formatter based on log_format setting
        formatter: logging.Formatter
        console_formatter: logging.Formatter

        if self.log_format == "json":
            # JSON formatter with trace context
            formatter = CustomJSONFormatter(
                service_name=self.service_name,
                include_hostname=True,
                indent=self.log_json_indent,
            )
            # Console uses pretty-print in development, compact in production
            console_formatter = CustomJSONFormatter(
                service_name=self.service_name,
                include_hostname=False,  # Skip hostname for console
                indent=2 if os.getenv("ENVIRONMENT", "development") == "development" else None,
            )
        else:
            # Text formatter with trace context
            log_format_str = (
                "%(asctime)s - %(name)s - %(levelname)s - [trace_id=%(otelTraceID)s span_id=%(otelSpanID)s] - %(message)s"
            )
            formatter = logging.Formatter(log_format_str)
            console_formatter = formatter

        # Console handler (stdout) - always enabled
        console_handler = logging.StreamHandler(sys.stdout)
        console_handler.setLevel(logging.INFO)
        console_handler.setFormatter(console_formatter)

        handlers: list[logging.Handler] = [console_handler]

        # File handlers - opt-in only
        if enable_file_logging:
            # Create logs directory if it doesn't exist
            logs_dir = Path("logs")
            logs_dir.mkdir(exist_ok=True)

            # Rotating file handler (size-based rotation)
            # Rotates when file reaches 10MB, keeps 5 backup files
            rotating_handler = RotatingFileHandler(
                filename=logs_dir / f"{self.service_name}.log",
                maxBytes=10 * 1024 * 1024,  # 10MB
                backupCount=5,
                encoding="utf-8",
            )
            rotating_handler.setLevel(logging.INFO)
            rotating_handler.setFormatter(formatter)
            handlers.append(rotating_handler)

            # Time-based rotating handler (daily rotation)
            # Rotates daily at midnight, keeps 30 days of logs
            daily_handler = TimedRotatingFileHandler(
                filename=logs_dir / f"{self.service_name}-daily.log",
                when="midnight",
                interval=1,
                backupCount=30,
                encoding="utf-8",
            )
            daily_handler.setLevel(logging.INFO)
            daily_handler.setFormatter(formatter)
            handlers.append(daily_handler)

            # Error log handler (only ERROR and CRITICAL)
            error_handler = RotatingFileHandler(
                filename=logs_dir / f"{self.service_name}-error.log",
                maxBytes=10 * 1024 * 1024,  # 10MB
                backupCount=5,
                encoding="utf-8",
            )
            error_handler.setLevel(logging.ERROR)
            error_handler.setFormatter(formatter)
            handlers.append(error_handler)

        # Configure root logger
        logging.basicConfig(level=logging.INFO, handlers=handlers)

        self.logger = logging.getLogger(self.service_name)
        if OBSERVABILITY_VERBOSE:
            print(f"✓ Logging configured: {self.service_name}")
            print(f"  - Format: {self.log_format.upper()}")
            print("  - Console output: INFO and above")
            if enable_file_logging:
                print(f"  - Main log: logs/{self.service_name}.log (rotating, 10MB, 5 backups)")
                print(f"  - Daily log: logs/{self.service_name}-daily.log (daily, 30 days)")
                print(f"  - Error log: logs/{self.service_name}-error.log (ERROR and above)")
            else:
                print("  - File logging: disabled (console only)")

    def get_tracer(self) -> trace.Tracer:
        """Get tracer instance"""
        return self.tracer

    def get_meter(self) -> otel_metrics.Meter:
        """Get meter instance"""
        return self.meter

    def get_logger(self) -> logging.Logger:
        """Get logger instance"""
        return self.logger

    def _setup_langsmith(self) -> None:
        """Configure LangSmith tracing (best-effort; failures are non-fatal)."""
        try:
            from mcp_server_langgraph.observability.langsmith import langsmith_config

            if langsmith_config.is_enabled():
                if OBSERVABILITY_VERBOSE:
                    print("✓ LangSmith integration enabled")
                    print("  - Dual observability: OpenTelemetry + LangSmith")
            else:
                if OBSERVABILITY_VERBOSE:
                    print("⚠ LangSmith configured but not enabled (check API key)")
        except ImportError:
            if OBSERVABILITY_VERBOSE:
                print("⚠ LangSmith not available (install langsmith package)")
        except Exception as e:
            if OBSERVABILITY_VERBOSE:
                print(f"⚠ LangSmith setup failed: {e}")
422# ============================================================================
423# Lazy Initialization System
424# ============================================================================
# Module-level singletons, populated by init_observability().
_observability_config: ObservabilityConfig | None = None
_propagator: TraceContextTextMapPropagator | None = None


def is_initialized() -> bool:
    """Return True once init_observability() has populated the singleton."""
    return _observability_config is not None
def init_observability(
    settings: Any | None = None,
    service_name: str = SERVICE_NAME,
    otlp_endpoint: str = OTLP_ENDPOINT,
    enable_console_export: bool = True,
    enable_langsmith: bool = False,
    log_format: str = "json",
    log_json_indent: int | None = None,
    enable_file_logging: bool = False,
) -> ObservabilityConfig:
    """
    Initialize the observability system (tracing, metrics, logging).

    Entry points MUST call this explicitly after configuration is loaded;
    deferring initialization this way avoids circular imports and filesystem
    operations at module-import time. Repeated calls are idempotent and
    return the already-built configuration.

    Args:
        settings: Optional settings object; when provided, its fields take
            precedence over the keyword arguments below.
        service_name: Service identifier attached to telemetry.
        otlp_endpoint: OpenTelemetry collector endpoint.
        enable_console_export: Also export telemetry to the console.
        enable_langsmith: Enable LangSmith integration.
        log_format: "json" or "text".
        log_json_indent: JSON indent for pretty-printing (None for compact).
        enable_file_logging: Enable file-based log rotation (opt-in).

    Returns:
        The singleton ObservabilityConfig instance.

    Example:
        >>> from mcp_server_langgraph.core.config import settings
        >>> from mcp_server_langgraph.observability.telemetry import init_observability
        >>> config = init_observability(settings, enable_file_logging=True)
    """
    global _observability_config, _propagator

    # Idempotent: reuse the existing configuration on repeat calls.
    if _observability_config is not None:
        return _observability_config

    if settings is not None:
        # Settings override the explicit keyword arguments.
        enable_langsmith = settings.langsmith_tracing and settings.observability_backend in ("langsmith", "both")
        log_format = getattr(settings, "log_format", "json")
        log_json_indent = getattr(settings, "log_json_indent", None)
        otlp_endpoint = getattr(settings, "otlp_endpoint", OTLP_ENDPOINT)
        enable_console_export = getattr(settings, "enable_console_export", True)
        # enable_file_logging can be overridden via settings
        enable_file_logging = getattr(settings, "enable_file_logging", enable_file_logging)

    _observability_config = ObservabilityConfig(
        service_name=service_name,
        otlp_endpoint=otlp_endpoint,
        enable_console_export=enable_console_export,
        enable_langsmith=enable_langsmith,
        log_format=log_format,
        log_json_indent=log_json_indent,
        enable_file_logging=enable_file_logging,
    )

    _propagator = TraceContextTextMapPropagator()

    return _observability_config
def get_config() -> ObservabilityConfig:
    """
    Return the initialized observability config (lazy accessor).

    Raises:
        RuntimeError: If init_observability() has not been called yet.
    """
    cfg = _observability_config
    if cfg is None:
        msg = (
            "Observability not initialized. Call init_observability(settings) "
            "in your entry point before using observability features."
        )
        raise RuntimeError(msg)
    return cfg
def shutdown_observability() -> None:
    """
    Shutdown observability system and flush all pending telemetry data.

    Call during application shutdown to ensure:
    - All pending spans are flushed to exporters
    - All pending metrics are exported
    - HTTP connections to collectors are closed gracefully
    - No telemetry data is lost

    Usage:
        - Call from FastAPI lifespan shutdown event
        - Register with atexit.register() as fallback
        - Call in exception handlers before exit

    Thread Safety:
        NOT thread-safe. Call only during shutdown when no new telemetry
        is being generated.
    """
    global _observability_config

    cfg = _observability_config
    if cfg is None:
        return  # Nothing to shutdown

    def _drain(provider: Any, flush_msg: str, stop_msg: str) -> None:
        # Flush pending data then shut the provider down; tolerate providers
        # that implement only part of the API.
        if hasattr(provider, "force_flush"):
            provider.force_flush(timeout_millis=5000)
            if OBSERVABILITY_VERBOSE:
                print(flush_msg, file=sys.stderr)
        if hasattr(provider, "shutdown"):
            provider.shutdown()
            if OBSERVABILITY_VERBOSE:
                print(stop_msg, file=sys.stderr)

    try:
        if hasattr(cfg, "tracer_provider"):
            _drain(cfg.tracer_provider, "✅ Flushed trace spans", "✅ Shutdown tracer provider")

        if hasattr(cfg, "meter_provider"):
            _drain(cfg.meter_provider, "✅ Flushed metrics", "✅ Shutdown meter provider")

        if OBSERVABILITY_VERBOSE:
            print("✅ Observability system shutdown complete", file=sys.stderr)

    except Exception as e:
        # Log error but don't raise - shutdown should be graceful
        print(f"⚠️ Error during observability shutdown: {e}", file=sys.stderr)

    finally:
        _observability_config = None  # Mark as shutdown
578# Note: config is available via get_config() function or via the lazy 'config' proxy below
def get_tracer() -> Any:
    """
    Return the configured tracer, or a safe no-op tracer before init.

    Returns:
        Tracer instance (either ObservabilityConfig tracer or no-op tracer)
    """
    if _observability_config is not None:
        return get_config().get_tracer()
    # Not initialized: OpenTelemetry's API-level tracer is a no-op, so
    # callers never crash when tracing before init_observability().
    from opentelemetry.trace import get_tracer as get_noop_tracer

    return get_noop_tracer(__name__)
def get_meter() -> Any:
    """
    Return the configured meter, or a safe no-op meter before init.

    Returns:
        Meter instance (either ObservabilityConfig meter or no-op meter)
    """
    if _observability_config is not None:
        return get_config().get_meter()
    # Not initialized: OpenTelemetry's API-level meter is a no-op, so
    # metric calls silently do nothing instead of raising.
    from opentelemetry.metrics import get_meter as get_noop_meter

    return get_noop_meter(__name__)
def get_logger() -> Any:
    """
    Return the configured logger, or a basic stderr logger before init.

    This prevents RuntimeError when functions like create_user_provider()
    or create_session_store() are called before init_observability()
    (e.g., in test fixtures or standalone scripts).

    Returns:
        Logger instance (either ObservabilityConfig logger or fallback logger)
    """
    if _observability_config is not None:
        return get_config().get_logger()

    # Not initialized: hand back a plain stdlib logger, configured once.
    fallback = logging.getLogger("mcp-server-langgraph-fallback")
    if not fallback.handlers:
        stream_handler = logging.StreamHandler()
        stream_handler.setFormatter(logging.Formatter("%(levelname)s:%(name)s:%(message)s"))
        fallback.addHandler(stream_handler)
        fallback.setLevel(logging.INFO)
    return fallback
# Module-level exports with lazy initialization.
# Attribute access on these proxies resolves through the accessor at call
# time, so `config`/`metrics` raise RuntimeError if used before
# init_observability(), while tracer/meter/logger fall back to no-ops.


def _lazy_proxy(type_name: str, accessor: Any) -> Any:
    """Build a singleton whose attribute lookups delegate to accessor()."""
    return type(type_name, (), {"__getattr__": lambda self, name: getattr(accessor(), name)})()


tracer = _lazy_proxy("LazyTracer", get_tracer)

meter = _lazy_proxy("LazyMeter", get_meter)

logger = _lazy_proxy("LazyLogger", get_logger)

# Alias for backward compatibility - provides access to both config and metric instruments
config = _lazy_proxy("LazyConfig", get_config)

metrics = config  # metrics is an alias for config
659# Context propagation utilities
def inject_context(carrier: dict[str, str]) -> None:
    """Inject the current trace context into carrier (e.g., HTTP headers)."""
    propagator = _propagator
    if propagator is None:
        msg = "Observability not initialized. Call init_observability() first."
        raise RuntimeError(msg)
    propagator.inject(carrier)
def extract_context(carrier: dict[str, str]) -> Any:
    """Extract a trace context previously injected into carrier."""
    propagator = _propagator
    if propagator is None:
        msg = "Observability not initialized. Call init_observability() first."
        raise RuntimeError(msg)
    return propagator.extract(carrier)
676# Resilience pattern metrics (convenient exports for resilience module)
677# These are lazy proxies that safely no-op if observability not initialized
def _safe_metric_operation(metric_name: str, operation: str, *args: Any, **kwargs: Any) -> None:
    """
    Safely perform a metric operation; no-op if observability isn't initialized.

    This allows resilience patterns to record metrics even before
    init_observability() runs, which is common in tests and during startup.
    """
    cfg = _observability_config
    if cfg is None:
        return  # Observability not initialized - no-op
    try:
        instrument = getattr(cfg, metric_name)
        getattr(instrument, operation)(*args, **kwargs)
    except Exception:
        # Deliberately swallow metric errors: telemetry must never break
        # application logic.
        pass
def _lazy_metric(attr_name: str, *operations: str) -> Any:
    """Build a stand-in metric exposing `operations`, each forwarding to
    _safe_metric_operation (and therefore no-op before initialization)."""

    def _forward(op: str) -> Any:
        # Bind attr_name/op eagerly here to avoid the late-binding closure pitfall.
        def method(self: Any, *args: Any, **kwargs: Any) -> None:
            _safe_metric_operation(attr_name, op, *args, **kwargs)

        return method

    return type("LazyMetric", (), {op: _forward(op) for op in operations})()


circuit_breaker_state_gauge = _lazy_metric("circuit_breaker_state_gauge", "add", "set")

circuit_breaker_failure_counter = _lazy_metric("circuit_breaker_failure_counter", "add")

circuit_breaker_success_counter = _lazy_metric("circuit_breaker_success_counter", "add")

retry_attempt_counter = _lazy_metric("retry_attempt_counter", "add")

retry_exhausted_counter = _lazy_metric("retry_exhausted_counter", "add")

retry_success_after_retry_counter = _lazy_metric("retry_success_after_retry_counter", "add")

timeout_exceeded_counter = _lazy_metric("timeout_exceeded_counter", "add")

timeout_duration_histogram = _lazy_metric("timeout_duration_histogram", "record")

bulkhead_rejected_counter = _lazy_metric("bulkhead_rejected_counter", "add")

bulkhead_active_operations_gauge = _lazy_metric("bulkhead_active_operations_gauge", "set")

bulkhead_queue_depth_gauge = _lazy_metric("bulkhead_queue_depth_gauge", "set")

fallback_used_counter = _lazy_metric("fallback_used_counter", "add")

error_counter = _lazy_metric("error_counter", "add")