Coverage for src/mcp_server_langgraph/utils/response_optimizer.py: 89%

59 statements  

coverage.py v7.12.0, created at 2025-12-03 00:43 +0000

1""" 

2Response optimization utilities for token-efficient tool responses. 

3 

4Implements Anthropic's best practices for writing tools for agents: 

5- Token counting and truncation 

6- Response format control (concise vs detailed) 

7- High-signal information filtering 

8""" 

9 

10from typing import Any, Literal 

11 

12import litellm 

13 

14from mcp_server_langgraph.observability.telemetry import logger 

15 

16# Maximum tokens per response (Anthropic recommendation: ~25k tokens) 

17MAX_RESPONSE_TOKENS = 25000 

18DEFAULT_CONCISE_TOKENS = 500 

19DEFAULT_DETAILED_TOKENS = 2000 

20 

21 

class ResponseOptimizer:
    """
    Utility class for optimizing tool responses for agent consumption.

    Features:
    - Model-aware token counting via LiteLLM
    - Response truncation with helpful messages
    - Format control (concise vs detailed)
    - High-signal information extraction
    """

    def __init__(self, model: str = "gpt-4") -> None:
        """
        Initialize the response optimizer.

        Args:
            model: Model name for token encoding (default: gpt-4)
        """
        self.model = model
        # No logger call here: it would trigger observability initialization at module import time.
        # The logger is used only when methods are actually called.
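
For instance, callers can keep the default encoding or construct a per-model instance when counts must match a specific provider's tokenizer (a minimal sketch; the Claude identifier is just an example of a model name litellm recognizes):

    from mcp_server_langgraph.utils.response_optimizer import ResponseOptimizer

    optimizer = ResponseOptimizer()  # token counts use the default "gpt-4" encoding
    claude_optimizer = ResponseOptimizer(model="claude-3-5-sonnet-20241022")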

    def count_tokens(self, text: str) -> int:
        """
        Count tokens in text using LiteLLM's model-aware token counting.

        SECURITY (OpenAI Codex Finding #4):
        Uses litellm.token_counter(), which supports Gemini, GPT, Claude, and other models.
        The len(text) // 4 fallback is kept for compatibility but logs a warning for monitoring.

        Args:
            text: Text to count tokens for

        Returns:
            Number of tokens

        Note:
            - Gemini models: supported by litellm (tested)
            - OpenAI models: supported via tiktoken (tested)
            - Claude models: supported by litellm (tested)
            - Fallback: len(text) // 4 (conservative but inaccurate; monitor warnings)
        """
        if not text:
            return 0  # Empty text = 0 tokens

        try:
            # Use LiteLLM's model-aware token counting
            token_count: int = litellm.token_counter(model=self.model, text=text)  # type: ignore[attr-defined]
            return token_count
        except Exception as e:
            # SECURITY: Log fallback usage for monitoring.
            # If these warnings appear frequently, consider:
            # 1. Updating litellm to the latest version
            # 2. Adding a provider-specific tokenizer for this model
            # 3. Switching to a supported model
            logger.warning(
                f"LiteLLM token counting failed for model {self.model}, using fallback estimate (len/4). "
                f"This may be inaccurate and affect context budget management. Error: {e}",
                extra={
                    "model": self.model,
                    "text_length": len(text),
                    "estimated_tokens": len(text) // 4,
                    "error_type": type(e).__name__,
                },
            )
            return len(text) // 4
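
Using the optimizer instance from the sketch above, a call looks like this (the exact count depends on which tokenizer litellm resolves for the configured model):

    n = optimizer.count_tokens("Summarize the last three incident reports.")
    # If litellm has no tokenizer for the model, count_tokens logs a warning
    # and returns the len(text) // 4 estimate instead of raising.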

    def truncate_response(
        self, content: str, max_tokens: int = MAX_RESPONSE_TOKENS, truncation_message: str | None = None
    ) -> tuple[str, bool]:
        """
        Truncate response to fit within token limit using LiteLLM token counting.

        Args:
            content: Response content to truncate
            max_tokens: Maximum tokens allowed
            truncation_message: Custom message to append when truncated

        Returns:
            Tuple of (truncated_content, was_truncated)
        """
        # Count tokens using LiteLLM
        current_tokens = self.count_tokens(content)

        if current_tokens <= max_tokens:
            return content, False

        # Reserve tokens for truncation message
        if truncation_message is None:
            truncation_message = (
                "\n\n[Response truncated due to length. "
                "Use more specific filters or request detailed format for full results.]"
            )

        message_tokens = self.count_tokens(truncation_message)
        available_tokens = max_tokens - message_tokens

        if available_tokens <= 0:  # coverage: 119 ↛ 120, branch never taken (condition never true)
            logger.warning(
                "Truncation message too long for max_tokens",
                extra={"max_tokens": max_tokens, "message_tokens": message_tokens},
            )
            available_tokens = max(100, max_tokens - 50)

        # Character-based truncation with token counting
        # Estimate characters per token (roughly 4:1 ratio)
        estimated_chars = available_tokens * 4
        truncated_text = content[:estimated_chars]

        # Iteratively adjust until within token limit
        while self.count_tokens(truncated_text) > available_tokens and len(truncated_text) > 100:  # coverage: 132 ↛ 134, loop body never entered (condition never true)
            # Reduce by 10% each iteration
            truncated_text = truncated_text[: int(len(truncated_text) * 0.9)]

        final_tokens = self.count_tokens(truncated_text)

        logger.info(
            "Response truncated",
            extra={
                "original_tokens": current_tokens,
                "truncated_tokens": final_tokens,
                "truncation_ratio": final_tokens / current_tokens if current_tokens > 0 else 0,
            },
        )

        return truncated_text + truncation_message, True
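
Continuing the sketch, truncation returns both the (possibly shortened) text and a flag; the input below is a hypothetical oversized tool result:

    report = "p99 latency exceeded the SLO during the deploy window. " * 2000
    short, was_truncated = optimizer.truncate_response(report, max_tokens=500)
    # was_truncated is True, and `short` ends with the bracketed
    # "[Response truncated due to length. ...]" notice appended above.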

    def format_response(
        self, content: str, format_type: Literal["concise", "detailed"] = "concise", max_tokens: int | None = None
    ) -> str:
        """
        Format response according to specified format type.

        Args:
            content: Original response content
            format_type: "concise" or "detailed"
            max_tokens: Override default token limits

        Returns:
            Formatted response
        """
        # Determine token limit based on format
        if max_tokens is None:
            max_tokens = DEFAULT_CONCISE_TOKENS if format_type == "concise" else DEFAULT_DETAILED_TOKENS

        # Truncate if necessary
        formatted_content, was_truncated = self.truncate_response(
            content,
            max_tokens=max_tokens,
            truncation_message=(
                f"\n\n[Response truncated to {format_type} format. Request 'detailed' format for more information.]"
                if format_type == "concise"
                else None
            ),
        )

        return formatted_content
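
For example, a tool handler can expose both formats and leave the token budgets to the defaults above (fetch_report is a hypothetical helper standing in for the tool's raw output):

    full_result = fetch_report()  # hypothetical helper returning a long string
    concise = optimizer.format_response(full_result, format_type="concise")  # 500-token default budget
    detailed = optimizer.format_response(full_result, format_type="detailed", max_tokens=4000)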

    def extract_high_signal(self, data: dict[str, Any], exclude_fields: list[str] | None = None) -> dict[str, Any]:
        """
        Extract high-signal information from data, removing low-value technical fields.

        Following Anthropic's guidance: "Avoid low-level technical identifiers
        (uuid, mime_type) in favor of human-readable fields (name, file_type)"

        Args:
            data: Dictionary of data
            exclude_fields: Additional fields to exclude

        Returns:
            Dictionary with only high-signal fields
        """
        # Default low-signal fields to exclude
        low_signal_fields = {
            "uuid",
            "guid",
            "mime_type",
            "content_type",
            "created_at_timestamp",
            "updated_at_timestamp",
            "internal_id",
            "trace_id",
            "span_id",
        }

        if exclude_fields:
            low_signal_fields.update(exclude_fields)

        # Filter out low-signal fields
        filtered = {key: value for key, value in data.items() if key not in low_signal_fields}

        return filtered
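
A short illustration with the same optimizer and made-up fields; "uuid" and "mime_type" fall in the default low-signal set, and "owner" is excluded by the caller:

    doc = {
        "name": "Q3 incident review",
        "file_type": "pdf",
        "uuid": "2f6b1c0e-hypothetical",
        "mime_type": "application/pdf",
        "owner": "sre-team",
    }
    summary = optimizer.extract_high_signal(doc, exclude_fields=["owner"])
    # summary == {"name": "Q3 incident review", "file_type": "pdf"}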


# Global instance for convenience (uses the default model)
# Note: For model-specific counting, create a ResponseOptimizer with that model
_optimizer = ResponseOptimizer()


def count_tokens(text: str, model: str | None = None) -> int:
    """
    Count tokens in text using LiteLLM model-aware counting.

    Args:
        text: Text to count tokens for
        model: Optional model name for accurate counting (uses global default if None)

    Returns:
        Number of tokens
    """
    if model:
        # Use a model-specific optimizer for accurate counting
        optimizer = ResponseOptimizer(model=model)
        return optimizer.count_tokens(text)
    else:
        # Use the global optimizer with the default model
        return _optimizer.count_tokens(text)


def truncate_response(
    content: str, max_tokens: int = MAX_RESPONSE_TOKENS, truncation_message: str | None = None
) -> tuple[str, bool]:
    """Truncate response using the global optimizer."""
    return _optimizer.truncate_response(content, max_tokens, truncation_message)


def format_response(
    content: str, format_type: Literal["concise", "detailed"] = "concise", max_tokens: int | None = None
) -> str:
    """Format response using the global optimizer."""
    return _optimizer.format_response(content, format_type, max_tokens)


def extract_high_signal(data: dict[str, Any], exclude_fields: list[str] | None = None) -> dict[str, Any]:
    """Extract high-signal information using the global optimizer."""
    return _optimizer.extract_high_signal(data, exclude_fields)
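
The module-level helpers delegate to this shared instance, so callers that do not need a model-specific count can use them directly (a sketch; some_tool_call is a hypothetical producer of a large string):

    from mcp_server_langgraph.utils import response_optimizer as ro

    payload = some_tool_call()  # hypothetical: any large tool output string
    if ro.count_tokens(payload) > ro.MAX_RESPONSE_TOKENS:
        payload, _ = ro.truncate_response(payload)
    reply = ro.format_response(payload, format_type="concise")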