Traces Best Practices

Best practices for creating effective traces in your AI applications

Follow these best practices to create effective, meaningful traces that provide valuable insights into your AI applications.

🎯 Trace Naming

Descriptive and Consistent Names

# Good: Descriptive and consistent
trace_operation("customer-support-query")
trace_operation("rag-pipeline")
trace_operation("multi-agent-workflow")
 
# Bad: Generic or unclear
trace_operation("process")
trace_operation("main")
trace_operation("function")

Use Action-Oriented Names

# Good: Action-oriented
trace_operation("process-customer-query")
trace_operation("generate-ai-response")
trace_operation("validate-user-input")
 
# Bad: State-oriented
trace_operation("customer-query")
trace_operation("ai-response")
trace_operation("user-input")

📊 Attribute Organization

Group Related Attributes

# Group related attributes logically
span.set_attributes({
    # Customer context
    "customer.id": customer_id,
    "customer.tier": customer_tier,
    "customer.region": customer_region,
    
    # Query context
    "query.type": query_type,
    "query.length": len(query),
    "query.language": query_language,
    
    # AI context
    "ai.model": model_name,
    "ai.provider": provider,
    "ai.temperature": temperature
})

Use Consistent Naming Conventions

# Use hierarchical naming with dots
span.set_attributes({
    "business.customer_id": "cust_123",
    "business.operation": "support_query",
    "business.priority": "high",
    
    "ai.model": "gpt-4",
    "ai.provider": "openai",
    "ai.temperature": 0.7,
    
    "system.duration_ms": 1800,
    "system.status": "success"
})

🛡️ Error Handling

Comprehensive Error Tracking

with trace_operation("risky-operation") as span:
    try:
        result = risky_operation()
        span.set_status("success")
        return result
    except Exception as e:
        span.set_status("error", str(e))
        span.add_event("error.occurred", {
            "error.type": type(e).__name__,
            "error.message": str(e),
            "error.stack": traceback.format_exc()
        })
        raise

Error Context and Recovery

with trace_operation("api-call") as span:
    retry_count = 0
    max_retries = 3
    
    while retry_count <= max_retries:
        try:
            result = make_api_call()
            span.set_status("success")
            return result
        except Exception as e:
            retry_count += 1
            span.add_event("error.retry", {
                "error.type": type(e).__name__,
                "retry.attempt": retry_count,
                "retry.max_attempts": max_retries,
                "retry.will_retry": retry_count <= max_retries
            })
            
            if retry_count > max_retries:
                span.set_status("error", f"Max retries exceeded: {str(e)}")
                raise

🎪 Event Timing

Meaningful Event Placement

with trace_operation("process-query") as span:
    # Capture the start time so the completion event can report a duration
    start_time = time.time()
    
    # Start event with context
    span.add_event("operation.started", {
        "timestamp": start_time,
        "input.size": len(query),
        "input.type": "text"
    })
    
    # Process the query
    result = process_query(query)
    
    # Completion event with results
    end_time = time.time()
    span.add_event("operation.completed", {
        "timestamp": end_time,
        "output.size": len(result),
        "success": True,
        # time.time() returns seconds; convert to milliseconds
        "duration_ms": (end_time - start_time) * 1000
    })

State Change Events

with trace_operation("ai-completion") as span:
    # Initial state
    model = "gpt-3.5-turbo"
    span.add_event("ai.initialization", {
        "model": model,
        "temperature": 0.7
    })
    
    # State change
    if query_complexity > 0.8:
        span.add_event("ai.model.upgraded", {
            "from.model": model,
            "to.model": "gpt-4",
            "reason": "high_complexity"
        })
        model = "gpt-4"
    
    # Final state (reports the model actually selected)
    span.add_event("ai.completion.ready", {
        "final.model": model,
        "tokens.estimated": 200
    })

🔗 Span Hierarchy

Logical Parent-Child Relationships

with trace_operation("customer-support-query") as parent_span:
    # Set context at parent level
    parent_span.set_attributes({
        "customer.id": customer_id,
        "query.type": "support"
    })
    
    # Child spans inherit context
    with trace_operation("classify-query") as child_span:
        classification = classify_query(query)
    
    with trace_operation("generate-response") as child_span:
        response = generate_response(query, classification)
    
    # Parent can aggregate child results
    parent_span.set_attributes({
        "classification.result": classification,
        "response.length": len(response)
    })

Avoid Deep Nesting

# Good: Reasonable nesting depth
with trace_operation("main-operation") as span:
    with trace_operation("sub-operation-1") as sub_span:
        result1 = operation_1()
    
    with trace_operation("sub-operation-2") as sub_span:
        result2 = operation_2()
 
# Bad: Too deep nesting
with trace_operation("level1") as span1:
    with trace_operation("level2") as span2:
        with trace_operation("level3") as span3:
            with trace_operation("level4") as span4:
                with trace_operation("level5") as span5:
                    result = operation()

📈 Performance Considerations

Minimize Overhead

# Good: Essential attributes only
span.set_attributes({
    "customer.id": customer_id,
    "query.type": query_type,
    "ai.model": model_name
})
 
# Bad: Too many attributes
span.set_attributes({
    "customer.id": customer_id,
    "customer.name": customer_name,
    "customer.email": customer_email,
    "customer.phone": customer_phone,
    "customer.address": customer_address,
    # ... 50 more attributes
})

Use Conditional Attributes

# Set attribute values based on the customer's tier
if customer_tier == "premium":
    span.set_attribute("customer.priority", "high")
    span.set_attribute("ai.model", "gpt-4")
else:
    span.set_attribute("customer.priority", "normal")
    span.set_attribute("ai.model", "gpt-3.5-turbo")

🎯 Business Context

Include Business Metrics

span.set_attributes({
    "business.operation": "customer_support",
    "business.priority": "high",
    "business.customer_tier": "premium",
    "business.region": "us-west",
    "business.feature": "chatbot",
    "business.cost_center": "support_team"
})

Track Business Outcomes

span.add_event("business.outcome", {
    "customer.satisfaction": 4.5,
    "resolution.time_minutes": 15,
    "escalation.required": False,
    "follow_up.needed": True
})

🔍 Debugging Support

Include Debug Information

span.set_attributes({
    "debug.query_id": query_id,
    "debug.session_id": session_id,
    "debug.user_agent": request.headers.get("user-agent"),
    "debug.timestamp": time.time()
})

Trace Correlation

# Use consistent trace IDs across services
trace_id = generate_trace_id()
span.set_attribute("trace.correlation_id", trace_id)
 
# Pass trace ID to external services
external_service_call(trace_id=trace_id)

🚀 Next Steps

Now that you understand trace best practices, explore these related concepts:

- Spans Best Practices — best practices for individual operations
- Attributes Best Practices — best practices for metadata and context
- Events Best Practices — best practices for point-in-time occurrences

Effective traces are the foundation of observability. By following these best practices, you'll create traces that provide valuable insights into your AI applications.