Coverage for src/mcp_server_langgraph/utils/response_optimizer.py: 89%

59 statements  

coverage.py v7.12.0, created at 2025-12-03 00:43 +0000

1""" 

2Response optimization utilities for token-efficient tool responses. 

3 

4Implements Anthropic's best practices for writing tools for agents: 

5- Token counting and truncation 

6- Response format control (concise vs detailed) 

7- High-signal information filtering 

8""" 

9 

10from typing import Any, Literal 

11 

12import litellm 

13 

14from mcp_server_langgraph.observability.telemetry import logger 

15 

16# Maximum tokens per response (Anthropic recommendation: ~25k tokens) 

17MAX_RESPONSE_TOKENS = 25000 

18DEFAULT_CONCISE_TOKENS = 500 

19DEFAULT_DETAILED_TOKENS = 2000 

20 

21 

class ResponseOptimizer:
    """
    Utility class for optimizing tool responses for agent consumption.

    Features:
    - Model-aware token counting via LiteLLM
    - Response truncation with helpful messages
    - Format control (concise vs detailed)
    - High-signal information extraction
    """

    def __init__(self, model: str = "gpt-4") -> None:
        """
        Initialize the response optimizer.

        Args:
            model: Model name for token encoding (default: gpt-4)
        """
        self.model = model
        # No logger call here: it would trigger observability initialization at module import time.
        # The logger is used only when methods are actually called.
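
For instance, callers can keep the default encoding or construct a per-model instance when counts must match a specific provider's tokenizer (a minimal sketch; the Claude identifier is just an example of a model name litellm recognizes):

    from mcp_server_langgraph.utils.response_optimizer import ResponseOptimizer

    optimizer = ResponseOptimizer()  # token counts use the default "gpt-4" encoding
    claude_optimizer = ResponseOptimizer(model="claude-3-5-sonnet-20241022")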

    def count_tokens(self, text: str) -> int:
        """
        Count tokens in text using LiteLLM's model-aware token counting.

        SECURITY (OpenAI Codex Finding #4):
        Uses litellm.token_counter(), which supports Gemini, GPT, Claude, and other models.
        The len(text) // 4 fallback is kept for compatibility but logs a warning for monitoring.

        Args:
            text: Text to count tokens for

        Returns:
            Number of tokens

        Note:
            - Gemini models: supported by litellm (tested)
            - OpenAI models: supported via tiktoken (tested)
            - Claude models: supported by litellm (tested)
            - Fallback: len(text) // 4 (conservative but inaccurate; monitor warnings)
        """
        if not text:
            return 0  # Empty text = 0 tokens

        try:
            # Use LiteLLM's model-aware token counting
            token_count: int = litellm.token_counter(model=self.model, text=text)  # type: ignore[attr-defined]
            return token_count
        except Exception as e:
            # SECURITY: Log fallback usage for monitoring.
            # If these warnings appear frequently, consider:
            # 1. Updating litellm to the latest version
            # 2. Adding a provider-specific tokenizer for this model
            # 3. Switching to a supported model
            logger.warning(
                f"LiteLLM token counting failed for model {self.model}, using fallback estimate (len/4). "
                f"This may be inaccurate and affect context budget management. Error: {e}",
                extra={
                    "model": self.model,
                    "text_length": len(text),
                    "estimated_tokens": len(text) // 4,
                    "error_type": type(e).__name__,
                },
            )
            return len(text) // 4
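
Using the optimizer instance from the sketch above, a call looks like this (the exact count depends on which tokenizer litellm resolves for the configured model):

    n = optimizer.count_tokens("Summarize the last three incident reports.")
    # If litellm has no tokenizer for the model, count_tokens logs a warning
    # and returns the len(text) // 4 estimate instead of raising.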

    def truncate_response(
        self, content: str, max_tokens: int = MAX_RESPONSE_TOKENS, truncation_message: str | None = None
    ) -> tuple[str, bool]:
        """
        Truncate response to fit within token limit using LiteLLM token counting.

        Args:
            content: Response content to truncate
            max_tokens: Maximum tokens allowed
            truncation_message: Custom message to append when truncated

        Returns:
            Tuple of (truncated_content, was_truncated)
        """
        # Count tokens using LiteLLM
        current_tokens = self.count_tokens(content)

        if current_tokens <= max_tokens:
            return content, False

        # Reserve tokens for truncation message
        if truncation_message is None:
            truncation_message = (
                "\n\n[Response truncated due to length. "
                "Use more specific filters or request detailed format for full results.]"
            )

        message_tokens = self.count_tokens(truncation_message)
        available_tokens = max_tokens - message_tokens

        if available_tokens <= 0:  # coverage: 119 ↛ 120, branch never taken (condition never true)
            logger.warning(
                "Truncation message too long for max_tokens",
                extra={"max_tokens": max_tokens, "message_tokens": message_tokens},
            )
            available_tokens = max(100, max_tokens - 50)

        # Character-based truncation with token counting
        # Estimate characters per token (roughly 4:1 ratio)
        estimated_chars = available_tokens * 4
        truncated_text = content[:estimated_chars]

        # Iteratively adjust until within token limit
        while self.count_tokens(truncated_text) > available_tokens and len(truncated_text) > 100:  # coverage: 132 ↛ 134, loop body never entered (condition never true)
            # Reduce by 10% each iteration
            truncated_text = truncated_text[: int(len(truncated_text) * 0.9)]

        final_tokens = self.count_tokens(truncated_text)

        logger.info(
            "Response truncated",
            extra={
                "original_tokens": current_tokens,
                "truncated_tokens": final_tokens,
                "truncation_ratio": final_tokens / current_tokens if current_tokens > 0 else 0,
            },
        )

        return truncated_text + truncation_message, True
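
Continuing the sketch, truncation returns both the (possibly shortened) text and a flag; the input below is a hypothetical oversized tool result:

    report = "p99 latency exceeded the SLO during the deploy window. " * 2000
    short, was_truncated = optimizer.truncate_response(report, max_tokens=500)
    # was_truncated is True, and `short` ends with the bracketed
    # "[Response truncated due to length. ...]" notice appended above.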

    def format_response(
        self, content: str, format_type: Literal["concise", "detailed"] = "concise", max_tokens: int | None = None
    ) -> str:
        """
        Format response according to specified format type.

        Args:
            content: Original response content
            format_type: "concise" or "detailed"
            max_tokens: Override default token limits

        Returns:
            Formatted response
        """
        # Determine token limit based on format
        if max_tokens is None:
            max_tokens = DEFAULT_CONCISE_TOKENS if format_type == "concise" else DEFAULT_DETAILED_TOKENS

        # Truncate if necessary
        formatted_content, was_truncated = self.truncate_response(
            content,
            max_tokens=max_tokens,
            truncation_message=(
                f"\n\n[Response truncated to {format_type} format. Request 'detailed' format for more information.]"
                if format_type == "concise"
                else None
            ),
        )

        return formatted_content
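
For example, a tool handler can expose both formats and leave the token budgets to the defaults above (fetch_report is a hypothetical helper standing in for the tool's raw output):

    full_result = fetch_report()  # hypothetical helper returning a long string
    concise = optimizer.format_response(full_result, format_type="concise")  # 500-token default budget
    detailed = optimizer.format_response(full_result, format_type="detailed", max_tokens=4000)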

    def extract_high_signal(self, data: dict[str, Any], exclude_fields: list[str] | None = None) -> dict[str, Any]:
        """
        Extract high-signal information from data, removing low-value technical fields.

        Following Anthropic's guidance: "Avoid low-level technical identifiers
        (uuid, mime_type) in favor of human-readable fields (name, file_type)"

        Args:
            data: Dictionary of data
            exclude_fields: Additional fields to exclude

        Returns:
            Dictionary with only high-signal fields
        """
        # Default low-signal fields to exclude
        low_signal_fields = {
            "uuid",
            "guid",
            "mime_type",
            "content_type",
            "created_at_timestamp",
            "updated_at_timestamp",
            "internal_id",
            "trace_id",
            "span_id",
        }

        if exclude_fields:
            low_signal_fields.update(exclude_fields)

        # Filter out low-signal fields
        filtered = {key: value for key, value in data.items() if key not in low_signal_fields}

        return filtered
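
A short illustration with the same optimizer and made-up fields; "uuid" and "mime_type" fall in the default low-signal set, and "owner" is excluded by the caller:

    doc = {
        "name": "Q3 incident review",
        "file_type": "pdf",
        "uuid": "2f6b1c0e-hypothetical",
        "mime_type": "application/pdf",
        "owner": "sre-team",
    }
    summary = optimizer.extract_high_signal(doc, exclude_fields=["owner"])
    # summary == {"name": "Q3 incident review", "file_type": "pdf"}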


# Global instance for convenience (uses the default model)
# Note: For model-specific counting, create a ResponseOptimizer with that model
_optimizer = ResponseOptimizer()


def count_tokens(text: str, model: str | None = None) -> int:
    """
    Count tokens in text using LiteLLM model-aware counting.

    Args:
        text: Text to count tokens for
        model: Optional model name for accurate counting (uses global default if None)

    Returns:
        Number of tokens
    """
    if model:
        # Use a model-specific optimizer for accurate counting
        optimizer = ResponseOptimizer(model=model)
        return optimizer.count_tokens(text)
    else:
        # Use the global optimizer with the default model
        return _optimizer.count_tokens(text)


def truncate_response(
    content: str, max_tokens: int = MAX_RESPONSE_TOKENS, truncation_message: str | None = None
) -> tuple[str, bool]:
    """Truncate response using the global optimizer."""
    return _optimizer.truncate_response(content, max_tokens, truncation_message)


def format_response(
    content: str, format_type: Literal["concise", "detailed"] = "concise", max_tokens: int | None = None
) -> str:
    """Format response using the global optimizer."""
    return _optimizer.format_response(content, format_type, max_tokens)


def extract_high_signal(data: dict[str, Any], exclude_fields: list[str] | None = None) -> dict[str, Any]:
    """Extract high-signal information using the global optimizer."""
    return _optimizer.extract_high_signal(data, exclude_fields)
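
The module-level helpers delegate to this shared instance, so callers that do not need a model-specific count can use them directly (a sketch; some_tool_call is a hypothetical producer of a large string):

    from mcp_server_langgraph.utils import response_optimizer as ro

    payload = some_tool_call()  # hypothetical: any large tool output string
    if ro.count_tokens(payload) > ro.MAX_RESPONSE_TOKENS:
        payload, _ = ro.truncate_response(payload)
    reply = ro.format_response(payload, format_type="concise")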