-
Notifications
You must be signed in to change notification settings - Fork 20
Production Deployment
This guide covers best practices and considerations for deploying applications with Prometheus metrics to production environments.
- Metric names follow Prometheus conventions
- Label cardinality is manageable (< 1000 series per metric)
- All metrics have clear help text
- No sensitive data in metric names or labels
- Appropriate metric types chosen
- Metrics collection tested under expected load
- Memory usage is acceptable
- Metrics endpoint responds within 1 second
- No performance degradation in application
- Load testing completed with metrics enabled
- Metrics endpoint not exposed to public internet
- Authentication configured if needed
- Firewall rules configured
- HTTPS enabled for external access
- No sensitive data exposed in metrics
- Prometheus configured to scrape application
- Scrape interval set appropriately
- Dashboards created in Grafana
- Critical alerts configured
- Alert routing configured
For single-instance applications:
# prometheus.yml
scrape_configs:
- job_name: 'my-app'
static_configs:
- targets: ['app.example.com:9090']
scrape_interval: 15s
For applications with multiple instances:
scrape_configs:
- job_name: 'my-app-cluster'
static_configs:
- targets:
- 'app1.example.com:9090'
- 'app2.example.com:9090'
- 'app3.example.com:9090'
labels:
environment: 'production'
cluster: 'main'
Behind a load balancer:
scrape_configs:
- job_name: 'my-app'
# Scrape individual instances, not load balancer
static_configs:
- targets:
- 'app-instance1.internal:9090'
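- 'app-instance2.internal:9090'
Important: Prometheus should scrape individual instances, not the load balancer endpoint. A scrape routed through the balancer reaches an arbitrary instance each time, so counters from different processes get mixed into one series and appear to jump or reset.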
Using EC2 service discovery:
scrape_configs:
- job_name: 'my-app-aws'
ec2_sd_configs:
- region: us-east-1
port: 9090
relabel_configs:
- source_labels: [__meta_ec2_tag_App]
regex: my-app
action: keep
Using Azure service discovery:
scrape_configs:
- job_name: 'my-app-azure'
azure_sd_configs:
- subscription_id: '<subscription-id>'
resource_group: 'my-resource-group'
Using Kubernetes service discovery:
scrape_configs:
- job_name: 'my-app-k8s'
kubernetes_sd_configs:
- role: pod
relabel_configs:
- source_labels: [__meta_kubernetes_pod_label_app]
action: keep
regex: my-app
- source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
action: keep
regex: true
type
// Settings for the metrics endpoint. LoadMetricsConfig fills every field
// from an environment variable, with a built-in default for each.
TMetricsConfig = record
// From METRICS_ENABLED; defaults to True.
Enabled: Boolean;
// From METRICS_PORT; defaults to 9090.
Port: Integer;
// From METRICS_PATH; intended default is '/metrics'.
Path: string;
// From SCRAPE_INTERVAL; defaults to 15.
ScrapeInterval: Integer; // seconds
end;
// Returns the value of environment variable AName, or ADefault when the
// variable is unset or empty.
function GetEnvOrDefault(const AName, ADefault: string): string;
begin
  Result := GetEnvironmentVariable(AName);
  if Result = '' then
    Result := ADefault;
end;

// Builds the metrics configuration from environment variables.
//   METRICS_ENABLED  -> Enabled         (default True)
//   METRICS_PORT     -> Port            (default 9090)
//   METRICS_PATH     -> Path            (default '/metrics')
//   SCRAPE_INTERVAL  -> ScrapeInterval  (default 15, seconds)
// Unset or unparsable values fall back to the default.
function LoadMetricsConfig: TMetricsConfig;
begin
  Result.Enabled := StrToBoolDef(GetEnvironmentVariable('METRICS_ENABLED'), True);
  Result.Port := StrToIntDef(GetEnvironmentVariable('METRICS_PORT'), 9090);
  // SysUtils.GetEnvironmentVariable accepts only the variable name, so the
  // default has to be applied explicitly.
  Result.Path := GetEnvOrDefault('METRICS_PATH', '/metrics');
  Result.ScrapeInterval := StrToIntDef(GetEnvironmentVariable('SCRAPE_INTERVAL'), 15);
end;

// Registers the core metrics unconditionally and the more detailed
// histograms only when the 'detailed_metrics' feature flag is enabled.
procedure RegisterMetrics;
begin
  // Always register core metrics
  TCounter.Create('requests_total', 'Total requests').Register();
  TGauge.Create('memory_bytes', 'Memory usage').Register();

  // Optional detailed metrics
  if FeatureFlags.IsEnabled('detailed_metrics') then
  begin
    THistogram.Create('request_duration_seconds', 'Request duration').Register();
    THistogram.Create('response_size_bytes', 'Response size').Register();
  end;
end;
Internal Network Only:
// Returns True when AIP is the IPv4 loopback address 127.0.0.1 or lies in
// one of the RFC 1918 private ranges:
//   10.0.0.0/8, 172.16.0.0/12 (172.16.x.x - 172.31.x.x), 192.168.0.0/16.
// Any other input, including an empty string, returns False.
// IPv6 addresses are not recognised.
function IsInternalIP(const AIP: string): Boolean;
var
  LParts: TArray<string>;
  LSecondOctet: Integer;
begin
  // Each comparison is parenthesised: in Pascal "or" binds tighter than "=",
  // so "A or B = C" would be parsed as "(A or B) = C".
  if (AIP = '127.0.0.1') or AIP.StartsWith('10.') or AIP.StartsWith('192.168.') then
    Exit(True);

  // 172.16.0.0/12 covers second octets 16 through 31, not just 16.
  LParts := AIP.Split(['.']);
  Result := (Length(LParts) = 4) and (LParts[0] = '172') and
    TryStrToInt(LParts[1], LSecondOctet) and
    (LSecondOctet >= 16) and (LSecondOctet <= 31);
end;
// Metrics handler restricted to internal callers: requests whose remote
// address fails IsInternalIP are answered with 403 "Forbidden".
procedure HandleMetricsRequest(ARequest: TRequest; AResponse: TResponse);
begin
  if IsInternalIP(ARequest.RemoteAddr) then
  begin
    // Serve metrics...
  end
  else
  begin
    AResponse.StatusCode := 403;
    AResponse.Content := 'Forbidden';
  end;
end;
Basic Auth:
// Metrics handler protected by HTTP Basic authentication: when the
// Authorization header does not pass ValidateBasicAuth, the caller gets a
// 401 with a WWW-Authenticate challenge for the "Metrics" realm.
procedure HandleMetricsRequest(ARequest: TRequest; AResponse: TResponse);
begin
  if ValidateBasicAuth(ARequest.Headers['Authorization']) then
  begin
    // Serve metrics...
  end
  else
  begin
    AResponse.StatusCode := 401;
    AResponse.SetHeader('WWW-Authenticate', 'Basic realm="Metrics"');
  end;
end;
Configure Prometheus:
scrape_configs:
- job_name: 'secured-app'
static_configs:
- targets: ['app.example.com:9090']
basic_auth:
username: 'prometheus'
password: 'secret'  # placeholder - in production prefer password_file so the secret stays out of prometheus.yml
Use HTTPS for external access:
scrape_configs:
- job_name: 'secure-app'
scheme: https
tls_config:
ca_file: /path/to/ca.crt
cert_file: /path/to/client.crt
key_file: /path/to/client.key
static_configs:
- targets: ['app.example.com:443']
Never expose sensitive data:
// Bad - exposes sensitive data
TCounter.Create('user_actions', 'Actions', ['user_email', 'api_key']);
// Good - no sensitive data
TCounter.Create('user_actions', 'Actions', ['user_type', 'action_type']);
For high-traffic scenarios:
type
// Caches the rendered metrics text so that frequent scrapes do not trigger a
// full collection every time; GetMetrics re-renders once the cached copy is
// older than FCacheDuration seconds.
TMetricsCache = class
private
// Most recently rendered metrics text.
FCache: string;
// Time of the last render, taken from Now.
FLastUpdate: TDateTime;
FCacheDuration: Integer; // seconds
// Held for the whole of GetMetrics to serialise access to the cache fields.
FLock: TCriticalSection;
public
// ACacheDuration is the cache lifetime in seconds (5 when omitted).
// NOTE(review): the constructor body is not shown here; it presumably
// creates FLock and stores ACacheDuration - confirm.
constructor Create(ACacheDuration: Integer = 5);
// Returns the cached metrics text, refreshing it first when it has expired.
function GetMetrics: string;
end;
// Returns the rendered metrics text, re-collecting from the default registry
// only when the cached copy is older than FCacheDuration seconds.
// The whole check-and-refresh runs under FLock, so concurrent scrapes never
// render twice or observe a half-updated cache.
function TMetricsCache.GetMetrics: string;
var
  LExposer: TTextExposer;
begin
  // FLock is a TCriticalSection, so lock it through its own Enter/Leave
  // rather than through TMonitor, which would ignore the critical section.
  FLock.Enter;
  try
    // TDateTime is measured in days; convert the lifetime from seconds.
    if (Now - FLastUpdate) > (FCacheDuration / SecsPerDay) then
    begin
      LExposer := TTextExposer.Create;
      try
        FCache := LExposer.Render(
          TCollectorRegistry.DefaultRegistry.Collect()
        );
        FLastUpdate := Now;
      finally
        LExposer.Free;
      end;
    end;
    Result := FCache;
  finally
    FLock.Leave;
  end;
end;
Register only necessary metrics:
// Development
if IsDebugMode then
RegisterDebugMetrics();
// Production
if IsProduction then
RegisterCoreMetrics();
Monitor and limit cardinality:
// Counts the samples across everything the default registry currently
// collects and logs a warning once the total exceeds 10000.
procedure CheckCardinality;
var
  LCollected: TArray<TMetricSamples>;
  LSeriesCount: Integer;
  LIndex: Integer;
begin
  LCollected := TCollectorRegistry.DefaultRegistry.Collect();

  LSeriesCount := 0;
  for LIndex := Low(LCollected) to High(LCollected) do
    Inc(LSeriesCount, Length(LCollected[LIndex].Samples));

  // Nothing to report while the total stays within the limit.
  if LSeriesCount <= 10000 then
    Exit;

  LogWarning('High metric cardinality: %d series', [LSeriesCount]);
end;
Monitor the metrics system itself:
// Metrics that describe the metrics subsystem itself.
var
// Duration of each collection pass; observed by CollectMetrics.
GMetricsCollectionDuration: THistogram;
// Number of samples seen in the last collection pass; set by CollectMetrics.
GMetricsSeriesTotal: TGauge;
// NOTE(review): registered below, but nothing shown in this guide ever
// updates it - confirm where the value is meant to be set.
GMetricsMemoryBytes: TGauge;
// Creates and registers the three self-monitoring metrics and stores them in
// the globals above. Call it before CollectMetrics, which uses those globals.
procedure RegisterSelfMetrics;
begin
GMetricsCollectionDuration := THistogram.Create(
'metrics_collection_duration_seconds',
'Time spent collecting metrics'
).Register();
GMetricsSeriesTotal := TGauge.Create(
'metrics_series_total',
'Total number of metric series'
).Register();
GMetricsMemoryBytes := TGauge.Create(
'metrics_memory_bytes',
'Memory used by metrics system'
).Register();
end;
// Collects all samples from the default registry, renders them as text and
// returns the result. Along the way it updates the self-metrics:
// GMetricsSeriesTotal receives the number of samples collected, and
// GMetricsCollectionDuration receives the elapsed time of the whole pass,
// which is recorded even when collection or rendering raises.
//
// Declared as a function: a procedure cannot have a return type or assign
// Result.
function CollectMetrics: string;
var
  LStopwatch: TStopwatch;
  LSamples: TArray<TMetricSamples>;
  LExposer: TTextExposer;
begin
  LStopwatch := TStopwatch.StartNew;
  try
    LSamples := TCollectorRegistry.DefaultRegistry.Collect();

    // Update self-metrics
    var LSeriesCount := 0;
    for var LMetric in LSamples do
      LSeriesCount := LSeriesCount + Length(LMetric.Samples);
    GMetricsSeriesTotal.SetTo(LSeriesCount);

    LExposer := TTextExposer.Create;
    try
      Result := LExposer.Render(LSamples);
    finally
      LExposer.Free;
    end;
  finally
    // The observation lands after this pass was rendered, so it first shows
    // up in the next scrape.
    LStopwatch.Stop;
    GMetricsCollectionDuration.Observe(LStopwatch.Elapsed.TotalSeconds);
  end;
end;
Add logging for debugging:
// Metrics handler with debug logging: logs the caller's address, serves the
// text returned by GetMetrics, and logs the response length. Any exception
// is logged as an error and then re-raised to the caller.
procedure HandleMetricsRequest(ARequest: TRequest; AResponse: TResponse);
var
  LBody: string;
begin
  LogDebug('Metrics request from: %s', [ARequest.RemoteAddr]);
  try
    LBody := GetMetrics();
    AResponse.Content := LBody;
    LogDebug('Metrics response: %d bytes', [Length(LBody)]);
  except
    on E: Exception do
    begin
      LogError('Metrics error: %s', [E.Message]);
      raise; // let the server's own error handling produce the response
    end;
  end;
end;
Implement health check endpoint:
// Health-check handler: responds with a JSON object containing a fixed
// "healthy" status, a metrics_enabled flag and the value of GetMetricsCount.
procedure HandleHealthCheck(ARequest: TRequest; AResponse: TResponse);
var
  LPayload: TJSONObject;
begin
  LPayload := TJSONObject.Create;
  try
    // AddPair returns the object itself, so the pairs can be chained.
    LPayload
      .AddPair('status', 'healthy')
      .AddPair('metrics_enabled', TJSONBool.Create(True))
      .AddPair('metrics_count', TJSONNumber.Create(GetMetricsCount()));

    AResponse.ContentType := 'application/json';
    AResponse.Content := LPayload.ToString;
  finally
    LPayload.Free; // also frees the values added above
  end;
end;
// Add debug endpoint (only in non-production)
{$IFNDEF RELEASE}
// Debug-only handler (compiled out when RELEASE is defined): responds with a
// plain-text listing of every collected metric - its name, its type and how
// many samples it currently holds.
procedure HandleDebugMetrics(ARequest: TRequest; AResponse: TResponse);
var
  LReport: TStringList;
  LCollected: TArray<TMetricSamples>;
  LIndex: Integer;
begin
  LReport := TStringList.Create;
  try
    LCollected := TCollectorRegistry.DefaultRegistry.Collect();
    LReport.Add(Format('Total metrics: %d', [Length(LCollected)]));

    for LIndex := Low(LCollected) to High(LCollected) do
    begin
      LReport.Add(''); // blank line between entries
      LReport.Add('Metric: ' + LCollected[LIndex].MetricName);
      LReport.Add(' Type: ' + GetEnumName(TypeInfo(TMetricType),
        Ord(LCollected[LIndex].MetricType)));
      LReport.Add(Format(' Samples: %d', [Length(LCollected[LIndex].Samples)]));
    end;

    AResponse.Content := LReport.Text;
  finally
    LReport.Free;
  end;
end;
{$ENDIF}
Store metrics configuration in version control:
// MetricsConfig.pas - the file name must match the unit name.
unit MetricsConfig;

interface

// Registers every application-level metric (HTTP, database, process).
// Keeping all registrations in this one unit puts the full metric set under
// version control and makes changes reviewable.
procedure RegisterApplicationMetrics;

implementation

// NOTE(review): add a "uses" clause here naming the units that declare
// TCounter, THistogram and TGauge.

procedure RegisterApplicationMetrics;
begin
  // HTTP Metrics
  // NOTE(review): 'endpoint' should carry a route template, not the raw
  // path, or label cardinality grows without bound - confirm with callers.
  TCounter.Create('http_requests_total',
    'Total HTTP requests', ['method', 'status', 'endpoint']).Register();
  THistogram.Create('http_request_duration_seconds',
    'HTTP request duration', ['method', 'endpoint']).Register();

  // Database Metrics
  TGauge.Create('db_connections_active',
    'Active database connections').Register();

  // Application Metrics
  TGauge.Create('app_memory_bytes',
    'Application memory usage').Register();
end;

end.
Each instance exposes its own metrics:
scrape_configs:
- job_name: 'app-cluster'
static_configs:
- targets: ['app1:9090', 'app2:9090', 'app3:9090']
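# Each target is stored as its own series (distinguished by the instance label);
# aggregate across instances at query time, e.g. sum without (instance) (rate(http_requests_total[5m]))
Deploy Prometheus in HA mode: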
# prometheus-1.yml
global:
external_labels:
replica: prometheus-1
# prometheus-2.yml
global:
external_labels:
replica: prometheus-2
groups:
- name: critical-alerts
rules:
- alert: HighErrorRate
expr: |
sum(rate(http_requests_total{status=~"5.."}[5m]))
/
sum(rate(http_requests_total[5m]))
> 0.05
for: 5m
labels:
severity: critical
annotations:
summary: "High error rate in production"
- alert: ApplicationDown
expr: up{job="my-app"} == 0
for: 1m
labels:
severity: critical
annotations:
summary: "Application instance is down"
Document for operations team:
# Metrics Operations Guide
## Endpoints
- Metrics: http://app:9090/metrics
- Health: http://app:9090/health
## Common Issues
1. Metrics endpoint slow
- Check cardinality: curl app:9090/metrics | wc -l
- Review recent label changes
2. Missing metrics
- Verify Prometheus can reach endpoint
- Check Prometheus targets page
- Verify metrics are registered
## Emergency Contacts
- On-call: ...
- Team lead: ...
- Performance Considerations - Optimization tips
- Best Practices - Usage guidelines
- Prometheus and Grafana - Server setup
- FAQ and Troubleshooting - Common issues