Complete guide to implementing production-grade LLM evaluation with the M.A.G.I. framework, LlamaIndex, LangChain, and Langfuse. Learn why metrics matter and how to build reliable AI systems.
Complete implementation guide for the M.A.G.I. framework: Measure, Automate, Govern, Instrument.
Measure: Quantify quality with targeted metrics for your specific use case
Automate: Make quality checks automatic gates in your CI/CD pipeline
Govern: Own the framework and keep it current with business needs
Instrument: Capture full-funnel logging and tracing for complete observability
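The reference implementation below walks through all four pillars in a single Python class. Its metric evaluators return simulated scores and the instrumentation step only logs locally, so treat it as a scaffold: hedged sketches of real judge and Langfuse integrations appear as comments inside the code, and a CI gate example follows at the end.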
# Complete M.A.G.I. Framework Implementation
import asyncio
from datetime import datetime, timezone
from typing import Dict, List, Any, Optional
class MAGIFramework:
"""
Complete M.A.G.I. Framework implementation for production LLM evaluation.
"""
def __init__(self, environment: str = "production"):
self.environment = environment
self.quality_gates = {
"development": {
"faithfulness": 0.70,
"answer_relevancy": 0.65,
"contextual_precision": 0.60,
"overall_quality": 0.65
},
"production": {
"faithfulness": 0.90,
"answer_relevancy": 0.85,
"contextual_precision": 0.75,
"overall_quality": 0.85
}
}
# M - MEASURE: Quantify quality with targeted metrics
async def measure_quality(self,
query: str,
response: str,
contexts: List[str],
trace_id: str,
expected_answer: Optional[str] = None) -> Dict[str, Any]:
"""
Comprehensive quality measurement across all relevant metrics.
"""
evaluation_results = {}
# 1. Faithfulness Evaluation
faithfulness_score = await self._evaluate_faithfulness(response, contexts)
evaluation_results["faithfulness"] = {
"score": faithfulness_score,
"feedback": "Response grounded in provided context" if faithfulness_score > 0.8 else "Response may contain hallucinations"
}
# 2. Answer Relevancy Evaluation
relevancy_score = await self._evaluate_relevancy(query, response, contexts)
evaluation_results["answer_relevancy"] = {
"score": relevancy_score,
"feedback": "Response directly addresses query" if relevancy_score > 0.8 else "Response may be off-topic"
}
# 3. Contextual Precision Evaluation
precision_score = await self._evaluate_contextual_precision(query, contexts, expected_answer)
evaluation_results["contextual_precision"] = {
"score": precision_score,
"feedback": "Retrieved highly relevant context" if precision_score > 0.7 else "Context relevance could be improved"
}
# 4. Overall Quality Score
scores = [result["score"] for result in evaluation_results.values()]
overall_score = sum(scores) / len(scores)
evaluation_results["overall_quality"] = {
"score": overall_score,
"component_scores": scores
}
return evaluation_results
# A - AUTOMATE: Make quality checks automatic gates
def automate_quality_gates(self,
evaluation_results: Dict[str, Any],
stage: str = "production") -> Dict[str, Any]:
"""
Automated quality gate checking with environment-specific thresholds.
"""
gates = self.quality_gates.get(stage, self.quality_gates["development"])
gate_results = {}
critical_failures = []
for metric, threshold in gates.items():
if metric in evaluation_results:
score = evaluation_results[metric]["score"]
passes = score >= threshold
gate_results[metric] = {
"score": score,
"threshold": threshold,
"passes": passes
}
if not passes:
critical_failures.append({
"metric": metric,
"score": score,
"threshold": threshold,
"gap": threshold - score
})
overall_pass = len(critical_failures) == 0
return {
"overall_pass": overall_pass,
"stage": stage,
"gate_results": gate_results,
"critical_failures": critical_failures,
"timestamp": datetime.utcnow().isoformat()
}
# G - GOVERN: Own the framework and keep it current
def govern_framework(self,
evaluation_history: List[Dict[str, Any]],
human_feedback: List[Dict[str, Any]]) -> Dict[str, Any]:
"""
Framework governance including drift detection and dataset management.
"""
governance_report = {
"framework_health": self._assess_framework_health(evaluation_history),
"golden_dataset_status": self._assess_golden_dataset(human_feedback),
"threshold_recommendations": self._recommend_threshold_updates(evaluation_history),
"governance_timestamp": datetime.utcnow().isoformat()
}
return governance_report
# I - INSTRUMENT: Full-funnel logging and tracing
def instrument_evaluation(self,
trace_id: str,
evaluation_results: Dict[str, Any],
gate_results: Dict[str, Any],
metadata: Optional[Dict[str, Any]] = None) -> str:
"""
Comprehensive instrumentation for complete observability.
"""
# Log to observability platform (Langfuse, etc.)
instrumentation_data = {
"trace_id": trace_id,
"evaluation_results": evaluation_results,
"gate_results": gate_results,
"metadata": metadata or {},
"framework": "M.A.G.I.",
"version": "2.0",
"timestamp": datetime.utcnow().isoformat()
}
# In production, this would send to your observability platform
print(f"[MAGI-INSTRUMENT] Logged evaluation data for trace {trace_id}")
return trace_id
# Complete M.A.G.I. execution
async def execute_magi_framework(self,
query: str,
response: str,
contexts: List[str],
trace_id: str,
stage: str = "production",
expected_answer: Optional[str] = None,
metadata: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
"""
Execute complete M.A.G.I. framework evaluation.
"""
# M - Measure quality
evaluation_results = await self.measure_quality(
query=query,
response=response,
contexts=contexts,
trace_id=trace_id,
expected_answer=expected_answer
)
# A - Automate quality gates
gate_results = self.automate_quality_gates(
evaluation_results=evaluation_results,
stage=stage
)
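# G - Govern: run govern_framework() periodically over accumulated evaluation
# history and human feedback, not on every request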
# I - Instrument everything
instrumented_trace_id = self.instrument_evaluation(
trace_id=trace_id,
evaluation_results=evaluation_results,
gate_results=gate_results,
metadata=metadata
)
# Generate business impact assessment
business_impact = self._assess_business_impact(evaluation_results, gate_results)
# Generate recommendations
recommendations = self._generate_recommendations(evaluation_results, gate_results)
return {
"overall_score": evaluation_results["overall_quality"]["score"],
"quality_gates": gate_results["overall_pass"],
"business_impact": business_impact,
"recommendations": recommendations,
"trace_id": instrumented_trace_id,
"evaluation_results": evaluation_results,
"gate_results": gate_results
}
# Helper methods for evaluation
async def _evaluate_faithfulness(self, response: str, contexts: List[str]) -> float:
"""Evaluate faithfulness using QAG method."""
# Implementation would use actual QAG evaluation
# For demo purposes, returning simulated score
return 0.95 if "employees with 5+ years" in response.lower() else 0.70
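# A hedged sketch of a real judge (assumption: LlamaIndex's FaithfulnessEvaluator
# from llama_index.core.evaluation; verify the import path and signature against
# your installed LlamaIndex version):
#   from llama_index.core.evaluation import FaithfulnessEvaluator
#   evaluator = FaithfulnessEvaluator(llm=judge_llm)  # judge_llm = your chosen judge model
#   result = await evaluator.aevaluate(response=response, contexts=contexts)
#   return float(result.score or 0.0)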
async def _evaluate_relevancy(self, query: str, response: str, contexts: List[str]) -> float:
"""Evaluate answer relevancy using LLM-as-Judge."""
# Implementation would use actual LLM judge
return 0.88 if "pto" in query.lower() and "days" in response.lower() else 0.65
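# A hedged sketch of a real judge (assumption: LlamaIndex's RelevancyEvaluator,
# which scores whether the response and retrieved contexts address the query;
# verify the import path and signature against your LlamaIndex version):
#   from llama_index.core.evaluation import RelevancyEvaluator
#   evaluator = RelevancyEvaluator(llm=judge_llm)
#   result = await evaluator.aevaluate(query=query, response=response, contexts=contexts)
#   return float(result.score or 0.0)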
async def _evaluate_contextual_precision(self, query: str, contexts: List[str], expected_answer: Optional[str]) -> float:
"""Evaluate contextual precision."""
# Implementation would check context relevance
return 0.82 if contexts and len(contexts) > 0 else 0.50
def _assess_framework_health(self, evaluation_history: List[Dict[str, Any]]) -> Dict[str, Any]:
"""Assess framework health and detect drift."""
return {
"drift_detected": False,
"evaluation_count": len(evaluation_history),
"health_score": 0.92
}
def _assess_golden_dataset(self, human_feedback: List[Dict[str, Any]]) -> Dict[str, Any]:
"""Assess golden dataset status."""
return {
"needs_refresh": len(human_feedback) > 100,
"feedback_count": len(human_feedback),
"quality_score": 0.88
}
def _recommend_threshold_updates(self, evaluation_history: List[Dict[str, Any]]) -> List[str]:
"""Recommend threshold updates based on performance."""
return ["Consider increasing faithfulness threshold to 0.92 based on consistent high performance"]
def _assess_business_impact(self, evaluation_results: Dict[str, Any], gate_results: Dict[str, Any]) -> str:
"""Assess business impact of evaluation results."""
overall_score = evaluation_results["overall_quality"]["score"]
if overall_score >= 0.90:
return "low"
elif overall_score >= 0.75:
return "medium"
else:
return "high"
def _generate_recommendations(self, evaluation_results: Dict[str, Any], gate_results: Dict[str, Any]) -> List[str]:
"""Generate actionable recommendations."""
recommendations = []
for failure in gate_results.get("critical_failures", []):
metric = failure["metric"]
gap = failure["gap"]
if metric == "faithfulness":
recommendations.append(f"Improve context relevance to increase faithfulness by {gap:.2f}")
elif metric == "answer_relevancy":
recommendations.append(f"Refine prompt engineering to improve relevancy by {gap:.2f}")
elif metric == "contextual_precision":
recommendations.append(f"Enhance retrieval ranking to improve precision by {gap:.2f}")
if not recommendations:
recommendations.append("All quality gates passed - maintain current performance")
return recommendations
# Example usage
async def magi_example():
# Initialize M.A.G.I. Framework
magi = MAGIFramework(environment="production")
# Execute complete framework
result = await magi.execute_magi_framework(
query="How many PTO days do I get after 5 years?",
response="Employees with 5+ years of service accrue 20 days of PTO annually.",
contexts=["Employees with 5+ years of service accrue 20 days of PTO annually."],
trace_id="trace-123",
stage="production",
expected_answer="20 days",
metadata={"user_tenure": "5_years", "policy_version": "2025"}
)
print(f"M.A.G.I. Evaluation Complete:")
print(f"Overall Score: {result['overall_score']:.3f}")
print(f"Quality Gates: {result['quality_gates']}")
print(f"Business Impact: {result['business_impact']}")
print(f"Recommendations: {result['recommendations']}")
return result
# Run the example
# asyncio.run(magi_example())
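To make the Automate pillar concrete in CI/CD, here is a minimal sketch of a pytest-based gate. It assumes pytest and pytest-asyncio are installed and that the MAGIFramework class above lives in an importable module; the module name and golden dataset below are illustrative, and in a real pipeline the response would be produced by running your RAG system on each query rather than read from the dataset.
# ci_quality_gate.py - sketch of an automated quality gate for CI (assumes pytest + pytest-asyncio)
import pytest
from magi_framework import MAGIFramework  # hypothetical module containing the class above

# Illustrative golden dataset; in practice, load a versioned eval set
GOLDEN_CASES = [
    {
        "query": "How many PTO days do I get after 5 years?",
        "response": "Employees with 5+ years of service accrue 20 days of PTO annually.",
        "contexts": ["Employees with 5+ years of service accrue 20 days of PTO annually."],
        "expected_answer": "20 days",
    },
]

@pytest.mark.asyncio
@pytest.mark.parametrize("case", GOLDEN_CASES)
async def test_quality_gates(case):
    magi = MAGIFramework(environment="production")
    result = await magi.execute_magi_framework(
        query=case["query"],
        response=case["response"],  # in CI, generate this with your RAG pipeline instead
        contexts=case["contexts"],
        trace_id=f"ci-{abs(hash(case['query']))}",
        stage="production",
        expected_answer=case["expected_answer"],
    )
    # Any failed gate fails the build; recommendations appear in the assertion message
    assert result["quality_gates"], f"Quality gates failed: {result['recommendations']}"
Run it with pytest ci_quality_gate.py in the pipeline stage that should block deployment.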