Observability Best Practices

Best practices for implementing effective tracing in AI applications, LLM systems, and RAG pipelines

Follow these best practices to implement effective tracing in your AI applications, ensuring you get maximum value from observability data.

🎯 Meaningful Span Names

Descriptive and Consistent Names

# ✅ Good span names
"llm-completion"
"document-retrieval"
"user-authentication"
"payment-processing"
 
# ❌ Poor span names
"function1"
"process"
"api_call"
"step"

Action-Oriented Naming

# Good: Action-oriented
"classify-query"
"generate-response"
"validate-input"
"retrieve-documents"
 
# Bad: Noun-oriented (names the result or state, not the action being performed)
"query-classification"
"response-generation"
"input-validation"
"document-retrieval"

Hierarchical Naming for Complex Operations

# Use hierarchical naming for complex operations
"rag-pipeline"
"rag-pipeline.query-analysis"
"rag-pipeline.document-retrieval"
"rag-pipeline.answer-generation"
 
"multi-agent-workflow"
"multi-agent-workflow.task-planning"
"multi-agent-workflow.agent-researcher"
"multi-agent-workflow.result-synthesis"

📊 Rich Attributes

Include Context for Debugging and Analysis

# ✅ Rich attributes using context managers
# Every key set here is a dimension you can later filter, group, or alert on.
with trace_llm(model="gpt-4", operation="llm-call") as span:
    span.set_attributes({
        "user.id": user_id,
        "user.plan": "premium",
        "llm.model": "gpt-4",  # same model that is passed to trace_llm above
        "llm.temperature": 0.7,
        "prompt.category": "technical_question",
        "response.confidence": 0.92
    })
 
# ❌ Minimal attributes
# A lone status flag carries no user, model, or prompt context to debug with.
span.set_attributes({"status": "ok"})

AI-Specific Attribute Categories

# LLM Attributes
# Request parameters: record them so runs with different settings can be compared.
span.set_attributes({
    "llm.model": "gpt-4",
    "llm.provider": "openai",
    "llm.temperature": 0.7,
    "llm.max_tokens": 1000,
    "llm.top_p": 0.9
})

# Cost Attributes
# `response` is the provider's response object; token counts are read from
# `response.usage`.
span.set_attributes({
    "llm.tokens.input": response.usage.prompt_tokens,
    "llm.tokens.output": response.usage.completion_tokens,
    "llm.cost.estimated": calculate_cost(response.usage)
})

# User Attributes
span.set_attributes({
    "user.id": user_id,
    "user.plan": "premium",
    "user.location": "us-west",
    "user.tier": "enterprise"
})

# Content Attributes
# Measure the generated text rather than the response object: `response` is
# used as an API object above (`response.usage`), which typically does not
# support len().
response_text = response.choices[0].message.content
span.set_attributes({
    "prompt.length": len(prompt),
    "response.length": len(response_text),
    "content.type": "technical_question",
    "content.language": "en"
})

# Quality Attributes
span.set_attributes({
    "relevance.score": 0.85,
    "confidence.level": 0.92,
    "accuracy.rating": "high"
})

Performance Attributes

import time
import psutil
from noveum_trace.context_managers import trace_operation
 
def expensive_operation_with_performance_tracking():
    """Run ``expensive_operation`` inside a span and record what it cost.

    On success the span receives the elapsed time in milliseconds and the
    change in resident memory in MB. If the operation raises, the span is
    flagged with ``performance.failed`` and the exception propagates.

    Returns:
        Whatever ``expensive_operation()`` returns.
    """
    with trace_operation("expensive-operation") as span:
        process = psutil.Process()

        def rss_mb():
            # Resident set size of this process, converted from bytes to MB.
            return process.memory_info().rss / 1024 / 1024

        start_memory = rss_mb()
        # perf_counter() is monotonic, so the measured duration cannot be
        # skewed by wall-clock adjustments the way time.time() can.
        start_time = time.perf_counter()

        try:
            result = expensive_operation()
        except Exception:
            span.set_attributes({'performance.failed': True})
            raise
        else:
            end_time = time.perf_counter()
            end_memory = rss_mb()

            span.set_attributes({
                'performance.duration_ms': (end_time - start_time) * 1000,
                'performance.memory_delta_mb': end_memory - start_memory,
                'performance.cpu_intensive': True
            })

            return result

🛡️ Error Handling

Comprehensive Error Capture

from noveum_trace.context_managers import trace_operation
from datetime import datetime
 
# Describe the outcome on the span either way: quality on success, error
# details plus a failure event on error. The exception is always re-raised.
with trace_operation("expensive-ai-operation") as span:
    try:
        result = expensive_ai_operation()
        quality = assess_quality(result)
        span.set_attributes({
            "operation.success": True,
            "operation.result_quality": quality
        })
    except Exception as exc:
        failure_attributes = {
            "operation.success": False,
            "error.type": type(exc).__name__,
            "error.message": str(exc)
        }
        span.set_attributes(failure_attributes)

        # The event pins down when the failure happened and whether the
        # caller may retry.
        failure_event = {
            "error.timestamp": datetime.now().isoformat(),
            "error.recoverable": is_recoverable_error(exc)
        }
        span.add_event("operation.failed", failure_event)
        raise

Error Context and Recovery

import time

# One span covers the whole call, retries included; every attempt is recorded
# as an event so the trace shows how many tries were needed and why each failed.
with trace_operation("api-call") as span:
    max_retries = 3
    total_attempts = max_retries + 1  # the initial call plus max_retries retries

    for attempt in range(1, total_attempts + 1):
        span.add_event("api.call.attempted", {
            "attempt": attempt,
            "max_retries": max_retries
        })

        # Time each attempt separately with a monotonic clock.
        attempt_started = time.perf_counter()

        try:
            result = make_api_call()
        except Exception as e:
            span.add_event("api.call.failed", {
                "attempt": attempt,
                "error.type": type(e).__name__,
                "error.message": str(e),
                "will_retry": attempt <= max_retries
            })

            if attempt > max_retries:
                span.set_status("error", f"Max retries exceeded: {str(e)}")
                raise
        else:
            # Only the API call itself is retried; recording the success
            # event happens outside the try so a telemetry problem cannot
            # trigger a second API call.
            span.add_event("api.call.succeeded", {
                "attempt": attempt,
                # perf_counter() returns seconds; convert to milliseconds to
                # match the key name.
                "duration_ms": (time.perf_counter() - attempt_started) * 1000
            })
            break

🧠 AI-Specific Tracing Patterns

RAG Pipeline Tracing

from noveum_trace.context_managers import trace_operation, trace_llm
 
def rag_query(question: str) -> str:
    """Answer ``question`` with a three-phase RAG pipeline, tracing each phase.

    A parent ``rag-pipeline`` span wraps three child spans: query analysis,
    document retrieval, and answer generation. Each child records attributes
    describing its own phase.

    Args:
        question: The user's question.

    Returns:
        The answer produced by ``generate_answer_with_context``.
    """
    import time  # local import keeps this snippet self-contained

    with trace_operation("rag-pipeline"):
        # Phase 1: Query understanding
        with trace_operation("query-analysis") as analysis_span:
            intent = analyze_query_intent(question)
            analysis_span.set_attributes({
                "query.intent": intent,
                "query.complexity": get_complexity_score(question),
                "query.length": len(question)
            })

        # Phase 2: Retrieval
        with trace_operation("document-retrieval") as retrieval_span:
            # Time embedding and search separately so the trace shows which
            # of the two dominates retrieval latency.
            embedding_started = time.perf_counter()
            embeddings = generate_embeddings(question)
            search_started = time.perf_counter()
            documents = vector_search(embeddings, k=5)
            search_finished = time.perf_counter()

            # Both timings are in milliseconds, matching the duration_ms
            # convention used elsewhere on this page.
            embedding_time = (search_started - embedding_started) * 1000
            search_time = (search_finished - search_started) * 1000

            retrieval_span.set_attributes({
                "retrieval.query_embedding_time": embedding_time,
                "retrieval.search_time": search_time,
                "retrieval.documents_found": len(documents),
                "retrieval.avg_similarity": avg_similarity(documents)
            })

        # Phase 3: Generation
        with trace_llm(model="gpt-4", operation="answer-generation") as generation_span:
            context = build_context(documents)
            answer = generate_answer_with_context(question, context)

            generation_span.set_attributes({
                "generation.context_length": len(context),
                "generation.answer_length": len(answer),
                "generation.model": "gpt-4"
            })

        return answer

Multi-Agent Tracing

from noveum_trace.context_managers import trace_operation, trace_llm
 
def multi_agent_task(task: str):
    """Plan ``task``, run every planned step in its own span, then synthesize.

    A parent ``multi-agent-task`` span wraps a planning span, one
    ``agent-<name>`` span per plan step, and a final synthesis span.

    Args:
        task: Description of the work to hand to the agents.

    Returns:
        The value returned by ``synthesize_results`` for the collected
        per-step results.
    """
    with trace_operation("multi-agent-task"):
        # Agent coordination: describe the task on the span, then build the plan.
        with trace_operation("task-planning") as span:
            planning_attributes = {
                "task.type": classify_task(task),
                "agents.required": ['researcher', 'writer', 'reviewer']
            }
            span.set_attributes(planning_attributes)
            plan = planning_agent.create_plan(task)

        # Individual agent execution: one span per step, named after its agent.
        collected = []
        for plan_step in plan.steps:
            span_name = f"agent-{plan_step.agent}"
            with trace_operation(span_name) as span:
                # What the agent was asked to do...
                span.set_attributes({
                    "agent.name": plan_step.agent,
                    "agent.task": plan_step.task,
                    "agent.tools": plan_step.tools
                })

                outcome = execute_agent_step(plan_step)

                # ...and how it went.
                span.set_attributes({
                    "agent.success": outcome.success,
                    "agent.confidence": outcome.confidence
                })

                collected.append(outcome)

        # Final synthesis
        with trace_operation("result-synthesis"):
            return synthesize_results(collected)

LLM Call Tracing

from noveum_trace.context_managers import trace_llm
import openai
 
def call_llm(model: str, prompt: str, user_id: str):
    """Send ``prompt`` to an OpenAI chat model inside an ``llm-call`` span.

    Request context is recorded before the call; token usage and estimated
    cost are recorded after it.

    Args:
        model: Model name passed both to the span and to the API call.
        prompt: Text sent as the single user message.
        user_id: Identifier stored on the span as ``user.id``.

    Returns:
        The message content of the first choice in the response.
    """
    with trace_llm(model=model, operation="llm-call") as span:
        # Request-side context, set before the call so it is present even if
        # the call fails.
        request_attributes = {
            "llm.model": model,
            "llm.provider": "openai",
            "user.id": user_id,
            "prompt.length": len(prompt),
            "prompt.type": "user_query"
        }
        span.set_attributes(request_attributes)

        messages = [{"role": "user", "content": prompt}]
        response = openai.chat.completions.create(model=model, messages=messages)

        # Response-side attributes: token counts and estimated cost.
        response_attributes = {
            "llm.tokens.input": response.usage.prompt_tokens,
            "llm.tokens.output": response.usage.completion_tokens,
            "llm.cost.estimated": calculate_cost(response.usage)
        }
        span.set_attributes(response_attributes)

        first_choice = response.choices[0]
        return first_choice.message.content

🎪 Event Patterns

Start/Complete Pattern

from noveum_trace.context_managers import trace_operation
from datetime import datetime
 
def process_document(doc_id: str):
    """Chunk and embed a document, recording each stage as a span event.

    Events emitted on the ``document-processing`` span, in order:
    ``processing.started``, ``document.chunked``, ``embeddings.generated``.
    If any stage raises, ``processing.failed`` is emitted and the exception
    propagates.

    Args:
        doc_id: Identifier of the document to process.

    Returns:
        Whatever ``generate_embeddings(chunks)`` returns.
    """
    with trace_operation("document-processing") as span:
        span.add_event("processing.started", {
            "document.id": doc_id,
            "timestamp": datetime.now().isoformat()
        })

        try:
            # Processing logic
            chunks = split_document(doc_id)
            chunk_count = len(chunks)
            # Guard the average: a document that yields no chunks would
            # otherwise raise ZeroDivisionError here and be reported as a
            # processing failure.
            avg_chunk_size = (
                sum(len(c) for c in chunks) / chunk_count if chunk_count else 0
            )
            span.add_event("document.chunked", {
                "chunks.count": chunk_count,
                "chunks.avg_size": avg_chunk_size
            })

            embeddings = generate_embeddings(chunks)
            span.add_event("embeddings.generated", {
                "embeddings.count": len(embeddings),
                "embeddings.model": "text-embedding-ada-002"
            })

            return embeddings

        except Exception as e:
            span.add_event("processing.failed", {
                "error.type": type(e).__name__,
                "error.message": str(e)
            })
            raise

State Change Events

with trace_operation("ai-completion") as span:
    # Keep the active model in one variable so every event reports the model
    # that is actually in use, not a hard-coded name.
    model = "gpt-3.5-turbo"

    # Initial state
    span.add_event("ai.initialization", {
        "model": model,
        "temperature": 0.7
    })

    # State change: switch to the larger model only for complex queries.
    if query_complexity > 0.8:
        upgraded_model = "gpt-4"
        span.add_event("ai.model.upgraded", {
            "from.model": model,
            "to.model": upgraded_model,
            "reason": "high_complexity"
        })
        model = upgraded_model

    # Final state: reflects the upgrade only if it happened.
    span.add_event("ai.completion.ready", {
        "final.model": model,
        "tokens.estimated": 200
    })

🔍 Debugging Strategies

Common Debugging Scenarios

1. Slow Response Times

Look for spans with high duration:
- Is the LLM call taking too long?
- Is document retrieval the bottleneck?
- Are there unnecessary sequential operations?

2. High Costs

Analyze cost-related attributes:
- Which models are being used?
- How many tokens are being consumed?
- Are there redundant API calls?

3. Quality Issues

Examine quality attributes:
- What's the confidence score of responses?
- How relevant are retrieved documents?
- Are there patterns in failed operations?

4. Error Patterns

Filter by error events and attributes:
- What types of errors are most common?
- Do errors correlate with specific users/inputs?
- Are errors happening at specific times?

Performance Monitoring

# Monitor key performance metrics
# Units are spelled out in the key suffixes (_ms, _mb, _percent) so readers of
# the trace do not have to guess them.
span.set_attributes({
    "performance.duration_ms": duration_ms,
    "performance.memory_usage_mb": memory_usage,
    "performance.cpu_usage_percent": cpu_usage,
    "performance.cache_hit_rate": cache_hit_rate
})

Cost Monitoring

# Track AI costs
# NOTE: the LLM examples on this page record the same data under
# "llm.tokens.input", "llm.tokens.output" and "llm.cost.estimated". Pick one
# naming scheme and use it everywhere so queries and dashboards line up.
span.set_attributes({
    "cost.tokens_input": input_tokens,
    "cost.tokens_output": output_tokens,
    "cost.usd_estimated": estimated_cost,
    "cost.model": model_name
})

🎯 Context Management

Consistent Context Propagation

# Set context at the trace level
with trace_operation("customer-query") as main_span:
    main_span.set_attributes({
        "customer.id": customer_id,
        "query.type": "support",
        "session.id": session_id
    })
    
    # Child spans inherit context
    # NOTE(review): confirm that your SDK version copies parent attributes onto
    # child spans; if it only links parent and child, set the shared
    # attributes on the child explicitly.
    with trace_operation("classify-query") as child_span:
        # This span automatically has customer.id and query.type
        classification = classify_query(query)

Business Context

# Include business-relevant context
# The values are literals for illustration; in real code, derive them from the
# request being handled.
span.set_attributes({
    "business.operation": "customer_support",
    "business.priority": "high",
    "business.customer_tier": "premium",
    "business.region": "us-west",
    "business.feature": "chatbot"
})

🚀 Next Steps

Now that you understand tracing best practices, explore these related guides:

Traces Best Practices - Best practices for complete request journeys
Spans Best Practices - Best practices for individual operations
Attributes Best Practices - Best practices for metadata and context
Events Best Practices - Best practices for point-in-time occurrences

Remember: Good observability is not about collecting all possible data, but about collecting the right data that helps you understand, debug, and optimize your AI applications.