Spans Best Practices

Best practices for creating effective spans in your AI applications

Follow these best practices to create meaningful, well-structured spans that provide clear insights into your operations.

🎯 Span Naming

Clear and Descriptive Names

# Good: Clear and descriptive
trace_operation("gpt-4-completion")
trace_operation("vector-search")
trace_operation("customer-data-processing")
 
# Bad: Generic or unclear
trace_operation("process")
trace_operation("call")
trace_operation("function")

Use Action-Oriented Names

# Good: Action-oriented
trace_operation("classify-query")
trace_operation("generate-response")
trace_operation("validate-input")
 
# Bad: Noun-oriented (names a thing rather than an action)
trace_operation("query-classification")
trace_operation("response-generation")
trace_operation("input-validation")

📊 Attribute Naming

Consistent Naming Conventions

# Use a consistent prefix for each category of attribute
span.set_attributes({
    "ai.model": "gpt-4",           # ai.* for AI-specific attributes
    "ai.provider": "openai",
    "business.customer_id": "123", # business.* for business attributes
    "system.duration_ms": 1800     # system.* for system attributes
})

Hierarchical Naming

# Use dot notation for logical hierarchies
span.set_attributes({
    "ai.model": "gpt-4",
    "ai.provider": "openai",
    "ai.temperature": 0.7,
    "ai.max_tokens": 1000,
    
    "customer.id": "cust_123",
    "customer.tier": "premium",
    "customer.region": "us-west",
    
    "query.type": "technical_support",
    "query.priority": "high",
    "query.language": "en"
})

🎪 Event Timing

Add Events at Meaningful Points

# Add events at meaningful points
span.add_event("operation.started", {"timestamp": time.time()})
 
# Do the work
result = perform_operation()
 
span.add_event("operation.completed", {
    "timestamp": time.time(),
    "result.size": len(result)
})

State Change Events

with trace_operation("ai-completion") as span:
    # Initial state
    span.add_event("ai.initialization", {
        "model": "gpt-3.5-turbo",
        "temperature": 0.7
    })
    
    # State change
    if query_complexity > 0.8:
        span.add_event("ai.model.upgraded", {
            "from.model": "gpt-3.5-turbo",
            "to.model": "gpt-4",
            "reason": "high_complexity"
        })
    
    # Final state
    span.add_event("ai.completion.ready", {
        "final.model": "gpt-4",
        "tokens.estimated": 200
    })

🛡️ Error Handling

Comprehensive Error Tracking

with trace_operation("risky-operation") as span:
    try:
        result = risky_operation()
        span.set_status("success")
        return result
    except Exception as e:
        span.set_status("error", str(e))
        span.add_event("error.occurred", {
            "error.type": type(e).__name__,
            "error.message": str(e)
        })
        raise

Error Context and Recovery

with trace_operation("api-call") as span:
    retry_count = 0
    max_retries = 3
    
    while retry_count <= max_retries:
        try:
            result = make_api_call()
            span.set_status("success")
            return result
        except Exception as e:
            retry_count += 1
            
            span.add_event("api.call.failed", {
                "attempt": retry_count,
                "error.type": type(e).__name__,
                "error.message": str(e),
                "will_retry": retry_count <= max_retries
            })
            
            if retry_count > max_retries:
                span.set_status("error", f"Max retries exceeded: {str(e)}")
                raise

🔗 Parent-Child Relationships

Logical Hierarchy

with trace_operation("parent-operation") as parent_span:
    # Child span 1
    with trace_operation("child-operation-1") as child1_span:
        result1 = operation_1()
    
    # Child span 2
    with trace_operation("child-operation-2") as child2_span:
        result2 = operation_2()
    
    # Parent span can access child results
    parent_span.set_attributes({
        "child1.result": result1,
        "child2.result": result2
    })

Context Inheritance

# Spans automatically inherit context from parents
with trace_operation("customer-query") as parent_span:
    parent_span.set_attributes({
        "customer.id": "cust_123",
        "query.type": "support"
    })
    
    # Child spans inherit customer context
    with trace_operation("classify-query") as child_span:
        # This span automatically has customer.id and query.type
        classification = classify_query(query)

📈 Performance Optimization

Minimize Attribute Overhead

# Good: Essential attributes only
span.set_attributes({
    "customer.id": customer_id,
    "query.type": query_type,
    "ai.model": model_name
})
 
# Bad: Too many attributes
span.set_attributes({
    "customer.id": customer_id,
    "customer.name": customer_name,
    "customer.email": customer_email,
    "customer.phone": customer_phone,
    "customer.address": customer_address,
    # ... 50 more attributes
})

Use Conditional Attributes

# Set attribute values based on the request context
if customer_tier == "premium":
    span.set_attribute("customer.priority", "high")
    span.set_attribute("ai.model", "gpt-4")
else:
    span.set_attribute("customer.priority", "normal")
    span.set_attribute("ai.model", "gpt-3.5-turbo")

🎯 AI-Specific Best Practices

LLM Span Attributes

with trace_llm(model="gpt-4", provider="openai") as span:
    response = openai.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": "Hello"}]
    )
    
    # Set usage attributes
    span.set_usage_attributes(
        input_tokens=response.usage.prompt_tokens,
        output_tokens=response.usage.completion_tokens
    )
    
    # Add model-specific attributes
    span.set_attributes({
        "ai.temperature": 0.7,
        "ai.max_tokens": 1000,
        "ai.finish_reason": response.choices[0].finish_reason
    })

Agent Span Context

with trace_agent(agent_type="researcher", agent_id="researcher_001") as span:
    span.set_attributes({
        "agent.capabilities": "web_search,analysis",
        "agent.task": "research_topic",
        "agent.input": topic,
        "agent.context": "customer_support"
    })
    
    result = research_agent.analyze(topic)
    
    span.set_attributes({
        "agent.output": result,
        "agent.confidence": result.confidence,
        "agent.sources_count": len(result.sources)
    })

Tool Execution Spans

with trace_tool(tool_name="web_search", tool_type="api") as span:
    span.set_attributes({
        "tool.input.query": query,
        "tool.input.max_results": 10,
        "tool.input.region": "us-west"
    })
    
    results = web_search_tool.search(query)
    
    span.set_attributes({
        "tool.output.results_count": len(results),
        "tool.output.success": True,
        "tool.output.quality_score": results.quality_score
    })

🔍 Debugging Support

Include Debug Information

span.set_attributes({
    "debug.span_id": span.span_id,
    "debug.trace_id": span.trace_id,
    "debug.timestamp": time.time(),
    "debug.version": "1.2.3"
})

Trace Correlation

# Use consistent correlation IDs
correlation_id = generate_correlation_id()
span.set_attribute("correlation.id", correlation_id)
 
# Pass correlation ID to external services
external_service_call(correlation_id=correlation_id)

🎪 Event Patterns

Start/Complete Pattern

with trace_operation("process-query") as span:
    # Record the start time so the duration can be reported later
    start_time = time.time()

    # Start event
    span.add_event("operation.started", {
        "timestamp": time.time(),
        "input.size": len(query),
        "input.type": "text"
    })
    
    try:
        # Process the query
        result = process_query(query)
        
        # Complete event
        span.add_event("operation.completed", {
            "timestamp": time.time(),
            "output.size": len(result),
            "success": True,
            # time.time() is in seconds; convert to milliseconds
            "duration_ms": (time.time() - start_time) * 1000
        })
        
    except Exception as e:
        # Error event
        span.add_event("operation.failed", {
            "timestamp": time.time(),
            "error.type": type(e).__name__,
            "error.message": str(e),
            # time.time() is in seconds; convert to milliseconds
            "duration_ms": (time.time() - start_time) * 1000
        })
        raise

Retry Pattern

with trace_operation("api-call") as span:
    retry_count = 0
    max_retries = 3
    # Record the start time so the total elapsed time can be reported on success
    start_time = time.time()
    
    while retry_count <= max_retries:
        try:
            span.add_event("api.call.attempted", {
                "timestamp": time.time(),
                "attempt": retry_count + 1,
                "max_retries": max_retries
            })
            
            result = make_api_call()
            
            span.add_event("api.call.succeeded", {
                "timestamp": time.time(),
                "attempt": retry_count + 1,
                # time.time() is in seconds; convert to milliseconds
                "duration_ms": (time.time() - start_time) * 1000
            })
            
            break
            
        except Exception as e:
            retry_count += 1
            
            span.add_event("api.call.failed", {
                "timestamp": time.time(),
                "attempt": retry_count,
                "error.type": type(e).__name__,
                "error.message": str(e),
                "will_retry": retry_count <= max_retries
            })
            
            if retry_count > max_retries:
                raise

🚀 Next Steps

Now that you understand span best practices, explore these related concepts:

Traces Best Practices - Best practices for complete request journeys
Attributes Best Practices - Best practices for metadata and context
Events Best Practices - Best practices for point-in-time occurrences

Well-structured spans are the building blocks of observability. By following these best practices, you'll create spans that provide clear insights into your operations.