Coverage for src/mcp_server_langgraph/llm/verifier.py: 92%
159 statements
coverage.py v7.12.0, created at 2025-12-03 00:43 +0000
1"""
2Work Verification Component for Agentic Workflows
4Implements Anthropic's "Verify Work" step in the agent loop:
5- LLM-as-judge: Use another LLM to evaluate outputs
6- Rules-based validation: Check against explicit criteria
7- Iterative refinement: Provide feedback for improvement
9References:
10- https://www.anthropic.com/engineering/building-agents-with-the-claude-agent-sdk
11"""
13from enum import Enum
14from typing import Any, Literal
16from langchain_core.messages import BaseMessage, HumanMessage
17from pydantic import BaseModel, Field
19from mcp_server_langgraph.llm.factory import create_verification_model
20from mcp_server_langgraph.observability.telemetry import logger, metrics, tracer
21import contextlib
24class VerificationCriterion(str, Enum):
25 """Criteria for evaluating agent outputs."""
27 ACCURACY = "accuracy" # Is the information correct?
28 COMPLETENESS = "completeness" # Does it fully answer the question?
29 CLARITY = "clarity" # Is it clear and well-structured?
30 RELEVANCE = "relevance" # Is it relevant to the user's request?
31 SAFETY = "safety" # Is it safe and appropriate?
32 SOURCES = "sources" # Are sources cited when appropriate?
35class VerificationResult(BaseModel):
36 """Result of output verification."""
38 passed: bool = Field(description="Whether verification passed")
39 overall_score: float = Field(ge=0.0, le=1.0, description="Overall quality score (0-1)")
40 criterion_scores: dict[str, float] = Field(default_factory=dict, description="Scores for individual criteria (0-1)")
41 feedback: str = Field(description="Actionable feedback for improvement")
42 requires_refinement: bool = Field(default=False, description="Whether output should be refined")
43 critical_issues: list[str] = Field(default_factory=list, description="Critical issues that must be fixed")
44 suggestions: list[str] = Field(default_factory=list, description="Optional suggestions for improvement")
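# Example (illustrative sketch, not part of the original module): a result for a
# response that clears the default 0.7 threshold with no critical issues.
#
#     VerificationResult(
#         passed=True,
#         overall_score=0.78,
#         criterion_scores={"accuracy": 0.8, "completeness": 0.75, "clarity": 0.8},
#         feedback="Clear and relevant; add a citation for the statistics.",
#         requires_refinement=False,
#     )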
47class OutputVerifier:
48 """
49 Verifies agent outputs using LLM-as-judge pattern.
51 Implements "Work Verification" from Anthropic's Agent SDK guide:
52 - Evaluates outputs against quality criteria
53 - Provides actionable feedback
54 - Supports iterative refinement
55 """
57 def __init__( # type: ignore[no-untyped-def]
58 self,
59 criteria: list[VerificationCriterion] | None = None,
60 quality_threshold: float = 0.7,
61 settings=None,
62 ):
63 """
64 Initialize output verifier.
66 Args:
67 criteria: Criteria to verify (default: all)
68 quality_threshold: Minimum score to pass (default: 0.7)
69 settings: Application settings (if None, uses global settings)
70 """
71 self.criteria = criteria or list(VerificationCriterion)
72 self.quality_threshold = quality_threshold
74 # Initialize dedicated LLM for verification (LLM-as-judge)
75 if settings is None:
76 from mcp_server_langgraph.core.config import settings as global_settings
78 settings = global_settings
80 self.llm = create_verification_model(settings)
82 logger.info(
83 "OutputVerifier initialized",
84 extra={
85 "criteria": [c.value for c in self.criteria],
86 "quality_threshold": quality_threshold,
87 },
88 )
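# Example (illustrative sketch): constructing a verifier limited to two criteria
# with a stricter passing bar; settings is omitted, so the global application
# settings are loaded as described above.
#
#     verifier = OutputVerifier(
#         criteria=[VerificationCriterion.ACCURACY, VerificationCriterion.SAFETY],
#         quality_threshold=0.8,
#     )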
90 async def verify_response(
91 self,
92 response: str,
93 user_request: str,
94 conversation_context: list[BaseMessage] | None = None,
95 verification_mode: Literal["standard", "strict", "lenient"] = "standard",
96 ) -> VerificationResult:
97 """
98 Verify agent response quality using LLM-as-judge.
100 Args:
101 response: Agent's response to verify
102 user_request: Original user request
103 conversation_context: Conversation history for context
104 verification_mode: Strictness level (default: standard)
106 Returns:
107 VerificationResult with scores and feedback
108 """
109 with tracer.start_as_current_span("verifier.verify_response") as span:
110 span.set_attribute("response.length", len(response))
111 span.set_attribute("verification.mode", verification_mode)
113 # Adjust threshold based on mode
114 threshold = self._get_threshold_for_mode(verification_mode)
116 # Build verification prompt using XML structure
117 verification_prompt = self._build_verification_prompt(response, user_request, conversation_context)
119 try:
120 # Get LLM judgment
121 # BUGFIX: Wrap prompt in HumanMessage to avoid string-to-character-list iteration
122 llm_response = await self.llm.ainvoke([HumanMessage(content=verification_prompt)])
124 # Get content and ensure it's a string
125 content = llm_response.content if hasattr(llm_response, "content") else str(llm_response)
126 judgment = str(content) if not isinstance(content, str) else content
128 # Parse judgment into structured result
129 result = self._parse_verification_judgment(judgment, threshold)
131 span.set_attribute("verification.passed", result.passed)
132 span.set_attribute("verification.overall_score", result.overall_score)
134 metrics.successful_calls.add(1, {"operation": "verify_response", "passed": str(result.passed).lower()})
136 logger.info(
137 "Response verified",
138 extra={
139 "passed": result.passed,
140 "overall_score": result.overall_score,
141 "requires_refinement": result.requires_refinement,
142 "critical_issues_count": len(result.critical_issues),
143 },
144 )
146 return result
148 except Exception as e:
149 logger.error(f"Verification failed: {e}", exc_info=True)
150 metrics.failed_calls.add(1, {"operation": "verify_response"})
151 span.record_exception(e)
153 # Fallback: Return permissive result
154 return VerificationResult(
155 passed=True, # Fail-open on verification errors
156 overall_score=0.5,
157 feedback=f"Verification system unavailable. Response accepted by default. Error: {e!s}",
158 requires_refinement=False,
159 )
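# Example (illustrative sketch): strict verification of a draft answer, feeding the
# feedback back into the agent loop. draft_answer and agent.refine are hypothetical
# names used only for illustration.
#
#     result = await verifier.verify_response(
#         response=draft_answer,
#         user_request="Summarize the quarterly report",
#         verification_mode="strict",
#     )
#     if result.requires_refinement:
#         draft_answer = await agent.refine(draft_answer, result.feedback)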
161 def _build_verification_prompt(
162 self, response: str, user_request: str, conversation_context: list[BaseMessage] | None = None
163 ) -> str:
164 """
165 Build verification prompt using XML structure (Anthropic best practice).
167 Args:
168 response: Response to verify
169 user_request: Original request
170 conversation_context: Conversation history
172 Returns:
173 Structured verification prompt
174 """
175 # Format conversation context if provided
176 context_section = ""
177 if conversation_context:
178 context_text = "\n".join([f"{self._get_role(msg)}: {msg.content[:200]}..." for msg in conversation_context[-3:]])
179 context_section = f"""<conversation_context>
180{context_text}
181</conversation_context>
183"""
185 # Build criteria section
186 criteria_descriptions = {
187 VerificationCriterion.ACCURACY: "Is the information factually correct?",
188 VerificationCriterion.COMPLETENESS: "Does it fully address all aspects of the user's request?",
189 VerificationCriterion.CLARITY: "Is it clear, well-organized, and easy to understand?",
190 VerificationCriterion.RELEVANCE: "Is it directly relevant to what the user asked?",
191 VerificationCriterion.SAFETY: "Is it safe, appropriate, and free from harmful content?",
192 VerificationCriterion.SOURCES: "Are sources cited when making factual claims?",
193 }
195 criteria_text = "\n".join([f"- {criterion.value}: {criteria_descriptions[criterion]}" for criterion in self.criteria])
197 prompt = f"""<task>
198Evaluate the quality of an AI assistant's response to a user request.
199</task>
201<role>
202You are a quality evaluator for AI assistant responses.
203Your job is to provide objective, constructive feedback.
204</role>
206{context_section}<user_request>
207{user_request}
208</user_request>
210<assistant_response>
211{response}
212</assistant_response>
214<evaluation_criteria>
215Evaluate the response on these criteria (score each 0.0-1.0):
216{criteria_text}
217</evaluation_criteria>
219<instructions>
2201. Evaluate each criterion independently with a score from 0.0 to 1.0
2212. Calculate an overall score (average of all criteria)
2223. Identify any critical issues that must be fixed
2234. Provide actionable feedback for improvement
2245. Suggest whether the response requires refinement
225</instructions>
227<output_format>
228Provide your evaluation in this exact format:
230SCORES:
231- accuracy: [0.0-1.0]
232- completeness: [0.0-1.0]
233- clarity: [0.0-1.0]
234- relevance: [0.0-1.0]
235- safety: [0.0-1.0]
236- sources: [0.0-1.0]
238OVERALL: [0.0-1.0]
240CRITICAL_ISSUES:
241- [Issue 1, if any]
242- [Issue 2, if any]
244SUGGESTIONS:
245- [Suggestion 1]
246- [Suggestion 2]
248REQUIRES_REFINEMENT: [yes/no]
250FEEDBACK:
251[Detailed, actionable feedback in 2-3 sentences]
252</output_format>"""
254 return prompt
256 def _parse_verification_judgment(self, judgment: str, threshold: float) -> VerificationResult: # noqa: C901
257 """
258 Parse LLM judgment into structured VerificationResult.
260 Args:
261 judgment: Raw LLM judgment text
262 threshold: Quality threshold for passing
264 Returns:
265 Structured VerificationResult
266 """
267 # Extract scores using simple parsing (can be enhanced with regex)
268 criterion_scores = {}
269 overall_score = None # Will be set from OVERALL or calculated
270 critical_issues = []
271 suggestions = []
272 requires_refinement = False
273 feedback = ""
275 lines = judgment.split("\n")
276 current_section = None
278 for line in lines:
279 line = line.strip()
281 if line.startswith("SCORES:"):
282 current_section = "scores"
283 elif line.startswith("OVERALL:"):
284 current_section = "overall"
285 with contextlib.suppress(ValueError, IndexError):
286 overall_score = float(line.split(":")[1].strip())
287 elif line.startswith("CRITICAL_ISSUES:"):
288 current_section = "critical"
289 elif line.startswith("SUGGESTIONS:"):
290 current_section = "suggestions"
291 elif line.startswith("REQUIRES_REFINEMENT:"):
292 current_section = "refinement"
293 requires_refinement = "yes" in line.lower()
294 elif line.startswith("FEEDBACK:"):
295 current_section = "feedback"
296 elif current_section == "scores" and ":" in line:
297 try:
298 criterion, score = line.split(":", 1)
299 criterion = criterion.strip(" -")
300 score = float(score.strip()) # type: ignore[assignment]
301 criterion_scores[criterion] = score
302 except (ValueError, IndexError):
303 pass
304 elif current_section == "critical" and line.startswith("-"):
305 issue = line[1:].strip()
306 # Filter out "None" or empty issues
307 if issue and issue.lower() not in ["none", "n/a", "na"]:
308 critical_issues.append(issue)
309 elif current_section == "suggestions" and line.startswith("-"):
310 suggestion = line[1:].strip()
311 if suggestion and suggestion.lower() not in ["none", "n/a", "na"]:  # coverage: 311 ↛ 278 (condition was always true)
312 suggestions.append(suggestion)
313 elif current_section == "feedback" and line:
314 feedback += line + " "
316 feedback = feedback.strip() or "No specific feedback provided."
318 # Calculate overall score from criteria if not explicitly provided in OVERALL
319 if overall_score is None:
320 if criterion_scores:  # coverage: 320 ↛ 321 (condition was never true)
321 overall_score = sum(criterion_scores.values()) / len(criterion_scores) # type: ignore[arg-type]
322 logger.info("Calculated overall score from criterion scores")
323 else:
324 overall_score = 0.5 # Default fallback
325 logger.warning("Failed to parse both overall score and criterion scores, using default")
327 passed = overall_score >= threshold and len(critical_issues) == 0
329 return VerificationResult(
330 passed=passed,
331 overall_score=overall_score,
332 criterion_scores=criterion_scores, # type: ignore[arg-type]
333 feedback=feedback,
334 requires_refinement=requires_refinement or not passed,
335 critical_issues=critical_issues,
336 suggestions=suggestions,
337 )
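# Example (illustrative sketch): a judgment in the expected output format, such as
#
#     SCORES:
#     - accuracy: 0.9
#     - completeness: 0.8
#     OVERALL: 0.85
#     CRITICAL_ISSUES:
#     - None
#     SUGGESTIONS:
#     - Cite the source of the figures
#     REQUIRES_REFINEMENT: no
#     FEEDBACK:
#     Accurate and well organized; add a citation for the figures.
#
# parses to criterion_scores={"accuracy": 0.9, "completeness": 0.8}, overall_score=0.85,
# critical_issues=[], suggestions=["Cite the source of the figures"],
# requires_refinement=False, and passed=True against the default 0.7 threshold.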
339 def _get_threshold_for_mode(self, mode: Literal["standard", "strict", "lenient"]) -> float:
340 """Get quality threshold based on verification mode."""
341 thresholds = {
342 "strict": self.quality_threshold + 0.1,
343 "standard": self.quality_threshold,
344 "lenient": self.quality_threshold - 0.1,
345 }
346 # Round to avoid floating point precision issues in tests
347 threshold = thresholds.get(mode, self.quality_threshold)
348 return round(max(0.0, min(1.0, threshold)), 2)
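# Example (illustrative): with the default quality_threshold of 0.7, "strict"
# verifies against 0.8, "standard" against 0.7, and "lenient" against 0.6.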
350 def _get_role(self, message: BaseMessage) -> str:
351 """Get role label for message."""
352 from langchain_core.messages import AIMessage, HumanMessage, SystemMessage
354 if isinstance(message, HumanMessage):
355 return "User"
356 elif isinstance(message, AIMessage):  # coverage: 356 ↛ 358 (condition was always true)
357 return "Assistant"
358 elif isinstance(message, SystemMessage):
359 return "System"
360 else:
361 return "Message"
363 async def verify_with_rules(self, response: str, rules: dict[str, Any]) -> VerificationResult:
364 """
365 Verify response against explicit rules (rules-based validation).
367 Alternative to LLM-as-judge for deterministic checks.
369 Args:
370 response: Response to verify
371 rules: Dictionary of rules to check
373 Returns:
374 VerificationResult based on rule compliance
376 Example rules:
377 {
378 "min_length": 50,
379 "max_length": 2000,
380 "required_keywords": ["example", "explanation"],
381 "forbidden_keywords": ["sorry", "I don't know"],
382 "must_include_code": True
383 }
384 """
385 issues = []
386 suggestions = []
387 criterion_scores = {}
389 # Check length constraints
390 if "min_length" in rules and len(response) < rules["min_length"]:
391 issues.append(f"Response too short (minimum: {rules['min_length']} characters)")
392 criterion_scores["completeness"] = 0.3
394 if "max_length" in rules and len(response) > rules["max_length"]: 394 ↛ 395line 394 didn't jump to line 395 because the condition on line 394 was never true
395 suggestions.append(f"Response could be more concise (maximum: {rules['max_length']} characters)")
396 criterion_scores["clarity"] = 0.7
398 # Check required keywords
399 if "required_keywords" in rules:
400 missing = [kw for kw in rules["required_keywords"] if kw.lower() not in response.lower()]
401 if missing:
402 issues.append(f"Missing required keywords: {', '.join(missing)}")
403 criterion_scores["completeness"] = 0.5
405 # Check forbidden keywords
406 if "forbidden_keywords" in rules:
407 found = [kw for kw in rules["forbidden_keywords"] if kw.lower() in response.lower()]
408 if found:  # coverage: 408 ↛ 413 (condition was always true)
409 issues.append(f"Contains forbidden keywords: {', '.join(found)}")
410 criterion_scores["quality"] = 0.4
412 # Check code inclusion
413 if rules.get("must_include_code") and "```" not in response:
414 issues.append("Response must include code examples")
415 criterion_scores["completeness"] = 0.6
417 # Calculate overall score
418 overall_score = (
419 1.0 if not issues else (sum(criterion_scores.values()) / len(criterion_scores) if criterion_scores else 0.5)
420 )
421 passed = len(issues) == 0
423 feedback = "All rule checks passed." if passed else f"Failed {len(issues)} rule check(s). " + "; ".join(issues)
425 logger.info(
426 "Rules-based verification completed",
427 extra={
428 "passed": passed,
429 "issues_count": len(issues),
430 "rules_checked": len(rules),
431 },
432 )
434 return VerificationResult(
435 passed=passed,
436 overall_score=overall_score,
437 criterion_scores=criterion_scores,
438 feedback=feedback,
439 requires_refinement=not passed,
440 critical_issues=issues,
441 suggestions=suggestions,
442 )
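# Example (illustrative sketch): deterministic checks on a response; it passes only
# if it is at least 50 characters, mentions "example", includes a ``` code fence,
# and avoids the forbidden phrase.
#
#     result = await verifier.verify_with_rules(
#         response,
#         {
#             "min_length": 50,
#             "required_keywords": ["example"],
#             "forbidden_keywords": ["I don't know"],
#             "must_include_code": True,
#         },
#     )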
445# Convenience function for easy import
446async def verify_output(
447 response: str,
448 user_request: str,
449 conversation_context: list[BaseMessage] | None = None,
450 verifier: OutputVerifier | None = None,
451) -> VerificationResult:
452 """
453 Verify agent output (convenience function).
455 Args:
456 response: Response to verify
457 user_request: Original user request
458 conversation_context: Conversation history
459 verifier: OutputVerifier instance (creates new if None)
461 Returns:
462 VerificationResult
463 """
464 if verifier is None:  # coverage: 464 ↛ 467 (condition was always true)
465 verifier = OutputVerifier()
467 return await verifier.verify_response(response, user_request, conversation_context)
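# Illustrative usage sketch (an assumption, not part of the original module): calling
# the convenience function from an async entrypoint. Assumes the verification model
# and application settings are configured; otherwise OutputVerifier() falls back to
# the global settings as shown in __init__.
import asyncio


async def _demo() -> None:
    result = await verify_output(
        response="Paris is the capital of France.",
        user_request="What is the capital of France?",
    )
    print(result.passed, result.overall_score, result.feedback)


if __name__ == "__main__":
    asyncio.run(_demo())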