# src/mcp_server_langgraph/utils/response_optimizer.py
1"""
2Response optimization utilities for token-efficient tool responses.
4Implements Anthropic's best practices for writing tools for agents:
5- Token counting and truncation
6- Response format control (concise vs detailed)
7- High-signal information filtering
8"""
10from typing import Any, Literal
12import litellm
14from mcp_server_langgraph.observability.telemetry import logger
16# Maximum tokens per response (Anthropic recommendation: ~25k tokens)
17MAX_RESPONSE_TOKENS = 25000
18DEFAULT_CONCISE_TOKENS = 500
19DEFAULT_DETAILED_TOKENS = 2000


class ResponseOptimizer:
    """
    Utility class for optimizing tool responses for agent consumption.

    Features:
    - Token counting using LiteLLM's model-aware counter
    - Response truncation with helpful messages
    - Format control (concise vs detailed)
    - High-signal information extraction
    """

    def __init__(self, model: str = "gpt-4") -> None:
        """
        Initialize response optimizer.

        Args:
            model: Model name for token encoding (default: gpt-4)
        """
        self.model = model
        # No logger call here: it would trigger observability initialization at
        # module import time. The logger is used only when methods are called.

    def count_tokens(self, text: str) -> int:
        """
        Count tokens in text using LiteLLM's model-aware token counting.

        SECURITY (OpenAI Codex Finding #4):
        Uses litellm.token_counter(), which now supports Gemini, GPT, Claude, and
        other models. The len(text) // 4 fallback is kept for compatibility but
        logs a warning for monitoring.

        Args:
            text: Text to count tokens for

        Returns:
            Number of tokens

        Note:
            - Gemini models: supported by litellm (tested)
            - OpenAI models: supported via tiktoken (tested)
            - Claude models: supported by litellm (tested)
            - Fallback: len(text) // 4 (conservative but inaccurate; monitor warnings)
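
        Example (illustrative; an empty string short-circuits before litellm
        is called, so the result is deterministic):
            >>> ResponseOptimizer().count_tokens("")
            0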
63 """
64 if not text:
65 return 0 # Empty text = 0 tokens
67 try:
68 # Use LiteLLM's model-aware token counting
69 token_count: int = litellm.token_counter(model=self.model, text=text) # type: ignore[attr-defined]
70 return token_count
71 except Exception as e:
72 # SECURITY: Log fallback usage for monitoring
73 # If you see these warnings frequently, consider:
74 # 1. Updating litellm to latest version
75 # 2. Adding provider-specific tokenizer for this model
76 # 3. Switching to a supported model
77 logger.warning(
78 f"LiteLLM token counting failed for model {self.model}, using fallback estimate (len/4). "
79 f"This may be inaccurate and affect context budget management. Error: {e}",
80 extra={
81 "model": self.model,
82 "text_length": len(text),
83 "estimated_tokens": len(text) // 4,
84 "error_type": type(e).__name__,
85 },
86 )
87 return len(text) // 4

    def truncate_response(
        self, content: str, max_tokens: int = MAX_RESPONSE_TOKENS, truncation_message: str | None = None
    ) -> tuple[str, bool]:
        """
        Truncate response to fit within the token limit, using LiteLLM token counting.

        Args:
            content: Response content to truncate
            max_tokens: Maximum tokens allowed
            truncation_message: Custom message to append when truncated

        Returns:
            Tuple of (truncated_content, was_truncated)
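
        Example (illustrative; content already under the limit passes through
        unchanged, whether litellm or the len // 4 fallback does the counting):
            >>> ResponseOptimizer().truncate_response("short answer", max_tokens=100)
            ('short answer', False)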
102 """
103 # Count tokens using LiteLLM
104 current_tokens = self.count_tokens(content)
106 if current_tokens <= max_tokens:
107 return content, False
109 # Reserve tokens for truncation message
110 if truncation_message is None:
111 truncation_message = (
112 "\n\n[Response truncated due to length. "
113 "Use more specific filters or request detailed format for full results.]"
114 )
116 message_tokens = self.count_tokens(truncation_message)
117 available_tokens = max_tokens - message_tokens
119 if available_tokens <= 0: 119 ↛ 120line 119 didn't jump to line 120 because the condition on line 119 was never true
120 logger.warning(
121 "Truncation message too long for max_tokens",
122 extra={"max_tokens": max_tokens, "message_tokens": message_tokens},
123 )
124 available_tokens = max(100, max_tokens - 50)
126 # Character-based truncation with token counting
127 # Estimate characters per token (roughly 4:1 ratio)
128 estimated_chars = available_tokens * 4
129 truncated_text = content[:estimated_chars]
131 # Iteratively adjust until within token limit
132 while self.count_tokens(truncated_text) > available_tokens and len(truncated_text) > 100: 132 ↛ 134line 132 didn't jump to line 134 because the condition on line 132 was never true
133 # Reduce by 10% each iteration
134 truncated_text = truncated_text[: int(len(truncated_text) * 0.9)]
136 final_tokens = self.count_tokens(truncated_text)
138 logger.info(
139 "Response truncated",
140 extra={
141 "original_tokens": current_tokens,
142 "truncated_tokens": final_tokens,
143 "truncation_ratio": final_tokens / current_tokens if current_tokens > 0 else 0,
144 },
145 )
147 return truncated_text + truncation_message, True

    def format_response(
        self, content: str, format_type: Literal["concise", "detailed"] = "concise", max_tokens: int | None = None
    ) -> str:
        """
        Format a response according to the specified format type.

        Args:
            content: Original response content
            format_type: "concise" or "detailed"
            max_tokens: Override the default token limits

        Returns:
            Formatted response
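
        Example (illustrative; short content fits the concise budget and is
        returned as-is):
            >>> ResponseOptimizer().format_response("done", format_type="concise")
            'done'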
162 """
163 # Determine token limit based on format
164 if max_tokens is None:
165 max_tokens = DEFAULT_CONCISE_TOKENS if format_type == "concise" else DEFAULT_DETAILED_TOKENS
167 # Truncate if necessary
168 formatted_content, was_truncated = self.truncate_response(
169 content,
170 max_tokens=max_tokens,
171 truncation_message=(
172 f"\n\n[Response truncated to {format_type} format. Request 'detailed' format for more information.]"
173 if format_type == "concise"
174 else None
175 ),
176 )
178 return formatted_content

    def extract_high_signal(self, data: dict[str, Any], exclude_fields: list[str] | None = None) -> dict[str, Any]:
        """
        Extract high-signal information from data, removing low-value technical fields.

        Following Anthropic's guidance: "Avoid low-level technical identifiers
        (uuid, mime_type) in favor of human-readable fields (name, file_type)"

        Args:
            data: Dictionary of data
            exclude_fields: Additional fields to exclude

        Returns:
            Dictionary with only high-signal fields
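
        Example (illustrative; this method never touches litellm, so the
        output is deterministic):
            >>> ResponseOptimizer().extract_high_signal({"uuid": "a1b2", "name": "report.pdf"})
            {'name': 'report.pdf'}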
193 """
194 # Default low-signal fields to exclude
195 low_signal_fields = {
196 "uuid",
197 "guid",
198 "mime_type",
199 "content_type",
200 "created_at_timestamp",
201 "updated_at_timestamp",
202 "internal_id",
203 "trace_id",
204 "span_id",
205 }
207 if exclude_fields:
208 low_signal_fields.update(exclude_fields)
210 # Filter out low-signal fields
211 filtered = {key: value for key, value in data.items() if key not in low_signal_fields}
213 return filtered


# Global instance for convenience (uses the default model).
# Note: for model-specific counting, create a ResponseOptimizer with that model.
_optimizer = ResponseOptimizer()


def count_tokens(text: str, model: str | None = None) -> int:
    """
    Count tokens in text using LiteLLM model-aware counting.

    Args:
        text: Text to count tokens for
        model: Optional model name for accurate counting (uses the global default if None)

    Returns:
        Number of tokens
    """
    if model:
        # Use a model-specific optimizer for accurate counting
        optimizer = ResponseOptimizer(model=model)
        return optimizer.count_tokens(text)
    else:
        # Use the global optimizer with the default model
        return _optimizer.count_tokens(text)


def truncate_response(
    content: str, max_tokens: int = MAX_RESPONSE_TOKENS, truncation_message: str | None = None
) -> tuple[str, bool]:
    """Truncate a response using the global optimizer."""
    return _optimizer.truncate_response(content, max_tokens, truncation_message)


def format_response(
    content: str, format_type: Literal["concise", "detailed"] = "concise", max_tokens: int | None = None
) -> str:
    """Format a response using the global optimizer."""
    return _optimizer.format_response(content, format_type, max_tokens)


def extract_high_signal(data: dict[str, Any], exclude_fields: list[str] | None = None) -> dict[str, Any]:
    """Extract high-signal information using the global optimizer."""
    return _optimizer.extract_high_signal(data, exclude_fields)
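

if __name__ == "__main__":
    # Minimal usage sketch, not part of the module's public surface. Assumes
    # litellm can resolve a tokenizer for the default "gpt-4" model; if it
    # cannot, count_tokens logs a warning and falls back to len(text) // 4.
    sample = "A long tool response " * 200

    print(f"sample tokens: {count_tokens(sample)}")

    truncated, was_truncated = truncate_response(sample, max_tokens=50)
    print(f"truncated: {was_truncated}, length: {len(truncated)} chars")

    concise = format_response(sample, format_type="concise")
    print(f"concise length: {len(concise)} chars")

    record = {"uuid": "a1b2c3", "name": "quarterly_report.pdf", "mime_type": "application/pdf"}
    print(extract_high_signal(record))  # -> {'name': 'quarterly_report.pdf'}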