2021-03-01 11:43:07.689 INFO [boot-otel-tempo-provider1,24b65cc591d3a69da10f308ab41343d7,6db7a6a82703fee9] 8 --- [nio-8090-exec-5] i.o.example.flight.FlightService : Doing some work In New span
The trace ID is the second field inside the square brackets [ ]; in this case it is 24b65cc591d3a69da10f308ab41343d7.
To integrate Loki with Tempo in Grafana (as a derived field on the log lines), we need the following regular expression:
\[.+,(.+),.+\]
The above adds a Tempo link button to each matching log line in Loki. Clicking the button splits the screen and displays the corresponding trace information from Tempo.
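As a quick sanity check outside Grafana, here is a minimal Go snippet (illustration only, not part of the setup) that applies the same expression to the sample log line above and prints the captured trace ID:
package main

import (
	"fmt"
	"regexp"
)

func main() {
	// The same expression configured as the derived field in Grafana.
	re := regexp.MustCompile(`\[.+,(.+),.+\]`)
	line := `2021-03-01 11:43:07.689 INFO [boot-otel-tempo-provider1,24b65cc591d3a69da10f308ab41343d7,6db7a6a82703fee9] 8 --- [nio-8090-exec-5] i.o.example.flight.FlightService : Doing some work In New span`
	if m := re.FindStringSubmatch(line); m != nil {
		fmt.Println(m[1]) // prints 24b65cc591d3a69da10f308ab41343d7
	}
}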
# Global settings and defaults.
global:
# Subtracted from Prometheus' scrape_timeout to give us some headroom and prevent Prometheus from
# timing out first.
scrape_timeout_offset: 500ms
# Minimum interval between collector runs: by default (0s) collectors are executed on every scrape.
min_interval: 0s
# Maximum number of open connections to any one target. Metric queries will run concurrently on
# multiple connections.
max_connections: 1
# Maximum number of idle connections to any one target.
max_idle_connections: 1
# The target to monitor and the list of collectors to execute on it.
#target:
# Data source name always has a URI schema that matches the driver name. In some cases (e.g. MySQL)
# the schema gets dropped or replaced to match the driver expected DSN format.
#data_source_name: 'sqlserver://prom_user:[email protected]:1433'
# Collectors (referenced by name) to execute on the target.
#collectors: [pricing_data_freshness]
# Jobs are equivalent to jobs in the Prometheus configuration: they group similar targets with similar metrics together.
jobs:
# All metrics from all targets get a `job` label, set to this value.
- job_name: app_one_mssql
# The set of collectors (defined below) to be applied to all targets in this job.
collectors: [app_one]
# Similar to the Prometheus configuration, multiple sets of targets may be defined, each with an optional set of
# labels to be applied to all metrics.
static_configs:
- targets:
'sqlserver': 'sqlserver://<USER>:<PWD>@<SERVER>:1433?database=<DB>'
labels:
type: sql_exporter
# Collector definition files.
collector_files:
- "*.collector.yaml"
Convert sql-exporter.yml to a Base64-encoded string:
openssl base64 -A -in sql-exporter.yml -out sql-exporter-base64encoded.txt
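If openssl is not available, the same single-line Base64 output can be produced with a few lines of Go (shown only as an equivalent of the command above):
package main

import (
	"encoding/base64"
	"fmt"
	"os"
)

func main() {
	// Read sql-exporter.yml and print it as a single-line Base64 string,
	// equivalent to: openssl base64 -A -in sql-exporter.yml
	data, err := os.ReadFile("sql-exporter.yml")
	if err != nil {
		panic(err)
	}
	fmt.Println(base64.StdEncoding.EncodeToString(data))
}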
OpenTelemetry is still in its early stages when it comes to metrics export, so we will use Prometheus, together with Spring Boot's Prometheus support, for metrics. For distributed logging, there is no parallel to Grafana Loki.
@opentelemetry/api provides the OpenTelemetry API, including all TypeScript interfaces, enums, and no-op implementations. It is intended for use both on the server and in the browser.
The core SDK package provides default implementations of the OpenTelemetry API for trace and metrics; it too is intended for use both on the server and in the browser.
Used standalone, @opentelemetry/tracing provides methods for manual instrumentation of code, offering full control over span creation for client-side JavaScript (browser) and Node.js. It does not provide automated instrumentation of known libraries, context propagation for asynchronous invocations, or distributed context out of the box. It contains the span processors and exporters.
The following enables automatic tracing for Express, HTTP/HTTPS, the AWS SDK, and MSSQL:
import log4js from 'log4js';
import opentelemetry, { context, getSpan, getSpanContext } from '@opentelemetry/api';
import {NodeTracerProvider} from '@opentelemetry/node'
import {registerInstrumentations} from '@opentelemetry/instrumentation'
import {JaegerExporter} from '@opentelemetry/exporter-jaeger'
import {SimpleSpanProcessor, BatchSpanProcessor, ConsoleSpanExporter} from '@opentelemetry/tracing'
const logger = log4js.getLogger("tracing");
logger.level = "debug";
// Enable OpenTelemetry exporters to export traces to Grafana Tempo.
const provider = new NodeTracerProvider ({
plugins: {
express: {
enabled: true,
path: '@opentelemetry/plugin-express',
},
http: {
enabled: true,
path: '@opentelemetry/plugin-http',
},
'aws-sdk': {
enabled: true,
// You may use a package name or absolute path to the file.
path: "opentelemetry-plugin-aws-sdk",
},
mssql: {
enabled: true,
// You may use a package name or absolute path to the file.
path: "opentelemetry-plugin-mssql",
},
},
});
// register and load instrumentations and old plugins - old plugins will be loaded automatically as before,
// but instrumentations need to be added explicitly
registerInstrumentations({
tracerProvider: provider
});
// Initialize the exporter.
const options = {
serviceName: process.env.OTEL_SERVICE_NAME,
tags: [], // optional
// You can use the default UDPSender
//host: 'localhost', // optional
//port: 6832, // optional
// OR you can use the HTTPSender as follows
//14250 : model.proto not working
endpoint: process.env.OTEL_EXPORTER_JAEGER_ENDPOINT,
maxPacketSize: 65000 // optional
}
/**
*
* Configure the span processor to send spans to the exporter
* The SimpleSpanProcessor does no batching and exports spans
* immediately when they end. For most production use cases,
* OpenTelemetry recommends use of the BatchSpanProcessor.
*/
provider.addSpanProcessor(new BatchSpanProcessor(new JaegerExporter(options)));
//provider.addSpanProcessor(new SimpleSpanProcessor(new ConsoleSpanExporter()));
/**
* Registering the provider with the API allows it to be discovered
* and used by instrumentation libraries. The OpenTelemetry API provides
* methods to set global SDK implementations, but the default SDK provides
* a convenience method named `register` which registers same defaults
* for you.
*
* By default the NodeTracerProvider uses Trace Context for propagation
* and AsyncHooksScopeManager for context management. To learn about
* customizing this behavior, see API Registration Options below.
*/
// Initialize the OpenTelemetry APIs to use the NodeTracerProvider bindings
provider.register();
export const tracer = opentelemetry.trace.getTracer(process.env.OTEL_SERVICE_NAME);
export const addTraceId = (req, res, next) => {
const spanContext = getSpanContext(context.active());
req.traceId = spanContext && spanContext.traceId;
next();
};
logger.debug("tracing initialized for %s sending span to %s", options.serviceName, options.endpoint);
We will use Go's flag library to parse command-line flags, along the lines of the sketch below.
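A minimal sketch of that flag handling; the flag names and defaults here (web.listen-address, web.telemetry-path, volume-dirs) are assumptions for illustration, not the exporter's final interface:
package main

import (
	"flag"
	"log"
)

func main() {
	// Hypothetical flags for the volume exporter; adjust names and defaults as needed.
	listenAddr := flag.String("web.listen-address", ":9888", "Address on which to expose metrics.")
	metricsPath := flag.String("web.telemetry-path", "/metrics", "Path under which to expose metrics.")
	volumeDirs := flag.String("volume-dirs", "", "Comma-separated name:path pairs of volumes to watch.")
	flag.Parse()

	log.Printf("volume exporter listening on %s%s, watching %q", *listenAddr, *metricsPath, *volumeDirs)
}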
Exporter
There are two types of data exposed by an exporter to Prometheus.
The first is the metric definition (name, description, and type); the second is the metric value.
If we look at the Prometheus Collector interface, this is exactly what it expects: whenever Prometheus scrapes the metrics endpoint, the following two methods are invoked. The first describes the metrics, while the other collects the metric values.
// Collector is the interface implemented by anything that can be used by
// Prometheus to collect metrics. A Collector has to be registered for
// collection. See Registerer.Register.
//
// The stock metrics provided by this package (Gauge, Counter, Summary,
// Histogram, Untyped) are also Collectors (which only ever collect one metric,
// namely itself). An implementer of Collector may, however, collect multiple
// metrics in a coordinated fashion and/or create metrics on the fly. Examples
// for collectors already implemented in this library are the metric vectors
// (i.e. collection of multiple instances of the same Metric but with different
// label values) like GaugeVec or SummaryVec, and the ExpvarCollector.
type Collector interface {
// Describe sends the super-set of all possible descriptors of metrics
// collected by this Collector to the provided channel and returns once
// the last descriptor has been sent. The sent descriptors fulfill the
// consistency and uniqueness requirements described in the Desc
// documentation.
//
// It is valid if one and the same Collector sends duplicate
// descriptors. Those duplicates are simply ignored. However, two
// different Collectors must not send duplicate descriptors.
//
// Sending no descriptor at all marks the Collector as “unchecked”,
// i.e. no checks will be performed at registration time, and the
// Collector may yield any Metric it sees fit in its Collect method.
//
// This method idempotently sends the same descriptors throughout the
// lifetime of the Collector. It may be called concurrently and
// therefore must be implemented in a concurrency safe way.
//
// If a Collector encounters an error while executing this method, it
// must send an invalid descriptor (created with NewInvalidDesc) to
// signal the error to the registry.
Describe(chan<- *Desc)
// Collect is called by the Prometheus registry when collecting
// metrics. The implementation sends each collected metric via the
// provided channel and returns once the last metric has been sent. The
// descriptor of each sent metric is one of those returned by Describe
// (unless the Collector is unchecked, see above). Returned metrics that
// share the same descriptor must differ in their variable label
// values.
//
// This method may be called concurrently and must therefore be
// implemented in a concurrency safe way. Blocking occurs at the expense
// of total performance of rendering all registered metrics. Ideally,
// Collector implementations support concurrent readers.
Collect(chan<- Metric)
}
Let's first create an exporter package and a volume_exporter.go file.
If we have more than one metric to expose, it is better to group them in a struct.
Let's define our volumeCollector struct holding the descriptors:
//Define a struct for your collector that contains pointers
//to prometheus descriptors for each metric you wish to expose.
//Note you can also include fields of other types if they provide utility,
//but we just won't be exposing them as metrics.
type volumeCollector struct {
volumeBytesTotal *prometheus.Desc
volumeBytesFree *prometheus.Desc
volumeBytesUsed *prometheus.Desc
}
Let's define the factory method that returns the structure:
//You must create a constructor for your collector that
//initializes every descriptor and returns a pointer to the collector
func newVolumeCollector() *volumeCollector {
return &volumeCollector{
volumeBytesTotal: prometheus.NewDesc(prometheus.BuildFQName(namespace, "", "bytes_total"),
"Total size of the volume/disk",
[]string{"name", "path"}, nil,
),
volumeBytesFree: prometheus.NewDesc(prometheus.BuildFQName(namespace, "", "bytes_free"),
"Free size of the volume/disk",
[]string{"name", "path"}, nil,
),
volumeBytesUsed: prometheus.NewDesc(prometheus.BuildFQName(namespace, "", "bytes_used"),
"Used size of volume/disk",
[]string{"name", "path"}, nil,
),
}
}
Implement the Describe method on the exporter:
//Each and every collector must implement the Describe function.
//It essentially writes all descriptors to the prometheus desc channel.
func (collector *volumeCollector) Describe(ch chan<- *prometheus.Desc) {
	//Update this section with each metric you create for a given collector
ch <- collector.volumeBytesTotal
ch <- collector.volumeBytesFree
ch <- collector.volumeBytesUsed
}
Implement the Collect method on the exporter:
//Collect implements the required Collect function for all prometheus collectors
func (collector *volumeCollector) Collect(ch chan<- prometheus.Metric) {
//Implement logic here to determine proper metric value to return to prometheus
//for each descriptor or call other functions that do so.
var metricValue float64
if 1 == 1 {
metricValue = 1
}
//Write latest value for each metric in the prometheus metric channel.
//Note that you can pass CounterValue, GaugeValue, or UntypedValue types here.
ch <- prometheus.MustNewConstMetric(collector.volumeBytesTotal, prometheus.GaugeValue, metricValue, "log", "path")
ch <- prometheus.MustNewConstMetric(collector.volumeBytesFree, prometheus.GaugeValue, metricValue, "log", "path")
ch <- prometheus.MustNewConstMetric(collector.volumeBytesUsed, prometheus.GaugeValue, metricValue, "log", "path")
}
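The metric value above is just a placeholder. One way to obtain real values is shown below; this is a Linux-only sketch using syscall.Statfs, and the path used is an assumption for illustration:
package main

import (
	"fmt"
	"syscall"
)

// volumeStats returns total, free and used bytes for the filesystem containing path.
// Linux-only sketch; error handling kept minimal for brevity.
func volumeStats(path string) (total, free, used uint64, err error) {
	var stat syscall.Statfs_t
	if err = syscall.Statfs(path, &stat); err != nil {
		return 0, 0, 0, err
	}
	total = stat.Blocks * uint64(stat.Bsize)
	free = stat.Bfree * uint64(stat.Bsize)
	return total, free, total - free, nil
}

func main() {
	total, free, used, err := volumeStats("/var/log")
	if err != nil {
		panic(err)
	}
	fmt.Println(total, free, used)
}
In Collect, these values would replace metricValue in the MustNewConstMetric calls, with the volume name and path as the label values.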
Let's define a method so that other packages can talk to the exporter, as sketched below.
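A minimal sketch of such a method, assuming the collector lives in an exporter package and is registered with the default Prometheus registry; the function name and namespace constant are illustrative assumptions:
package exporter

import "github.com/prometheus/client_golang/prometheus"

// namespace is assumed to be the metric namespace used by BuildFQName above,
// producing metric names such as volume_bytes_total.
const namespace = "volume"

// RegisterVolumeCollector is a hypothetical exported entry point so that other
// packages (for example main) can wire the volume collector into the default registry.
func RegisterVolumeCollector() {
	prometheus.MustRegister(newVolumeCollector())
}
main can then call exporter.RegisterVolumeCollector() and expose the metrics with promhttp.Handler() on the configured listen address.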
The following scrape config helps capture container Prometheus metrics. Add it to your prometheus.yaml:
# Example scrape config for pods
#
# The relabeling allows the actual pod scrape endpoint to be configured via the
# following annotations:
#
# * `prometheus.io/scrape`: Only scrape pods that have a value of `true`
# * `prometheus.io/path`: If the metrics path is not `/metrics` override this. This will be the same for every container in the pod that is scraped.
# * this will scrape every container in a pod with `prometheus.io/scrape` set to true and the port is name `metrics` in the container
# * note `prometheus.io/port` is no longer honored. You must name the port(s) to scrape `metrics`
- job_name: 'kubernetes-pods'
kubernetes_sd_configs:
- role: pod
namespaces:
names:
- ${NAME_SPACE}
relabel_configs:
- source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
action: keep
regex: true
- source_labels: [__meta_kubernetes_pod_container_port_name]
action: keep
regex: metrics(-.*)?
- source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
action: replace
target_label: __metrics_path__
regex: (.+)
- source_labels: [ __address__, __meta_kubernetes_pod_container_port_number]
action: replace
regex: (.+):(?:\d+);(\d+)
replacement: ${1}:${2}
target_label: __address__
- action: labelmap
regex: __meta_kubernetes_pod_label_(.+)
- source_labels: [__meta_kubernetes_namespace]
action: replace
target_label: kubernetes_namespace
- source_labels: [__meta_kubernetes_pod_name]
action: replace
target_label: kubernetes_pod_name
Add the Spring Boot Actuator and Micrometer Prometheus registry dependencies to your pom (the Spring Boot version should be 2.3.4.RELEASE or above), and add the following to your application.properties:
# Add this at the top of application.properties
server.port=9090
# Actuator endpoints exposed over HTTP
management.endpoints.web.exposure.include=prometheus,health,info,metrics
management.health.probes.enabled=true
management.endpoint.health.show-details=always
Add the following to your spec.containers:
ports:
- name: metrics
containerPort: 9090
Note: the port name should start with metrics, so that it matches the metrics(-.*)? regex in the scrape config above.
Add the prometheus.io/scrape: "true" annotation (and prometheus.io/path if the metrics path is not /metrics) to DeploymentConfig.spec.template.metadata, so that the relabeling rules above pick the pod up.
To avoid restart cycles, set the livenessProbe.initialDelaySeconds parameter to be safely longer than it takes your service to initialize. You can then use a shorter value for the readinessProbe.initialDelaySeconds attribute to route requests to the service as soon as it’s ready.
9090 is the actuator port (server.port=9090)
initialDelaySeconds – After creating the container, wait n seconds before initiating the probe
periodSeconds – How often this probe should be run, defaulting to 10 seconds; the minimum is 1 second
timeoutSeconds – How long we wait before timing out the probe, defaulting to 1 second; the minimum is again 1 second
failureThreshold – Try n times before giving up. In the case of readiness, our pod will be marked as not ready, whereas giving up in case of liveness means restarting the Pod. The default here is 3 failures, with the minimum being 1
successThreshold – This is the minimum number of consecutive successes for the probe to be considered successful after having failed. It defaults to 1 success and its minimum is 1 as well
apiVersion: v1
kind: Template
metadata:
name: grafana
annotations:
"openshift.io/display-name": Grafana
description: |
A Visualization solution for Prometheus
iconClass: fa fa-cogs
tags: "dashboard, grafana"
parameters:
- name: APP_NAME
description: "Value for app label."
- name: NAME_SPACE
description: "The name of the namespace (openshift project)"
- name: IMAGE_GRAFANA
description: Grafana Docker image
required: true
value: "grafana/grafana:7.3.6"
- name: VOLUME_CAPACITY
displayName: Volume Capacity
description: Volume space available for data, e.g. 512Mi, 2Gi.
value: 20Gi
required: true
objects:
- apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: grafana-lib-pvc
spec:
accessModes:
- ReadWriteMany
resources:
requests:
storage: "${VOLUME_CAPACITY}"
- apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: grafana-log-pvc
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: "${VOLUME_CAPACITY}"
- apiVersion: apps.openshift.io/v1
kind: DeploymentConfig
metadata:
labels:
app: ${APP_NAME}
name: ${APP_NAME}
namespace: "${NAME_SPACE}"
spec:
replicas: 1
selector:
app: ${APP_NAME}
template:
metadata:
labels:
app: ${APP_NAME}
name: grafana
spec:
containers:
- name: grafana
command:
- sh
args:
- -c
- /etc/grafana/grafana-prepare.sh ; exec /run.sh
image: ${IMAGE_GRAFANA}
imagePullPolicy: Always
ports:
- containerPort: 3000
name: grafana-http
protocol: TCP
livenessProbe:
httpGet:
path: /api/health
port: grafana-http
scheme: HTTP
initialDelaySeconds: 120
periodSeconds: 10
successThreshold: 1
failureThreshold: 3
timeoutSeconds: 1
readinessProbe:
failureThreshold: 3
httpGet:
path: /api/health
port: grafana-http
scheme: HTTP
initialDelaySeconds: 10
periodSeconds: 10
successThreshold: 1
timeoutSeconds: 1
env:
- name: GF_AUTH_LDAP_ENABLED
value: 'true'
- name: GF_AUTH_LDAP_CONFIG_FILE
value: /var/lib/grafana/ldap.toml
- name: GF_INSTALL_PLUGINS
value: 'grafana-clock-panel'
- name: GRAFANA_BIND_PWD
valueFrom:
secretKeyRef:
name: app-secret
key: GRAFANA_BIND_PWD
resources:
limits:
cpu: "500m"
memory: "128Mi"
requests:
cpu: "250m"
memory: "64Mi"
volumeMounts:
- mountPath: /etc/grafana
name: grafana-etc-volume
- mountPath: /etc/grafana/provisioning/datasources
name: grafana-datasources-volume
- mountPath: /etc/grafana/provisioning/dashboards
name: grafana-dashboard-config-volume
- mountPath: /var/lib/grafana-dashboards
name: grafana-dashboards
- mountPath: /var/lib/grafana
name: grafana-lib-volume
- mountPath: /var/log/grafana
name: grafana-log-volume
volumes:
- name: grafana-datasources-volume
configMap:
defaultMode: 420
name: grafana-datasources
- name: grafana-dashboard-config-volume
configMap:
defaultMode: 420
name: grafana-dashboard-config
- name: grafana-etc-volume
configMap:
defaultMode: 0777
name: grafana
- name: grafana-dashboards
configMap:
defaultMode: 420
name: grafana-dashboards
- name: grafana-lib-volume
persistentVolumeClaim:
claimName: grafana-lib-pvc
- name: grafana-log-volume
persistentVolumeClaim:
claimName: grafana-log-pvc
- apiVersion: v1
kind: Service
metadata:
labels:
app: ${APP_NAME}
name: grafana
namespace: "${NAME_SPACE}"
spec:
ports:
- name: grafana
port: 3000
protocol: TCP
targetPort: grafana-http
selector:
app: grafana
- apiVersion: route.openshift.io/v1
kind: Route
metadata:
labels:
app: ${APP_NAME}
name: grafana
namespace: "${NAME_SPACE}"
spec:
to:
kind: Service
name: grafana
weight: 100
tls:
termination: edge
wildcardPolicy: None
- apiVersion: v1
kind: ConfigMap
metadata:
name: grafana
data:
grafana-prepare.sh: |
#!/bin/sh
set -eu
sed "s|<GRAFANA_BIND_PWD>|$GRAFANA_BIND_PWD|g" /etc/grafana/ldap_pl.toml > /var/lib/grafana/ldap.toml
grafana.ini: |
##################### Grafana Configuration Defaults #####################
#
# Do not modify this file in grafana installs
#
# possible values : production, development
app_mode = production
# instance name, defaults to HOSTNAME environment variable value or hostname if HOSTNAME var is empty
instance_name = ${HOSTNAME}
#################################### Paths ###############################
[paths]
# Path to where grafana can store temp files, sessions, and the sqlite3 db (if that is used)
#
data = data
#
# Directory where grafana can store logs
#
logs = data/log
#
# Directory where grafana will automatically scan and look for plugins
#
plugins = data/plugins
# Directory where grafana will look for provisioning files (Data Sources, Dashboards)
provisioning = provisioning
#################################### Server ##############################
[server]
# Protocol (http, https, socket)
protocol = http
# The ip address to bind to, empty will bind to all interfaces
http_addr =
# The http port to use
http_port = 3000
# The public facing domain name used to access grafana from a browser
domain = localhost
# Redirect to correct domain if host header does not match domain
# Prevents DNS rebinding attacks
enforce_domain = false
# The full public facing url
root_url = %(protocol)s://%(domain)s:%(http_port)s/
# Log web requests
router_logging = false
# the path relative working path
static_root_path = public
# enable gzip
enable_gzip = false
# https certs & key file
cert_file =
cert_key =
# Unix socket path
socket = /tmp/grafana.sock
#################################### Database ############################
[database]
# You can configure the database connection by specifying type, host, name, user and password
# as separate properties or as on string using the url property.
# Either "mysql", "postgres" or "sqlite3", it's your choice
type = sqlite3
host = 127.0.0.1:3306
name = grafana
user = root
# If the password contains # or ; you have to wrap it with triple quotes. Ex """#password;"""
password =
# Use either URL or the previous fields to configure the database
# Example: mysql://user:secret@host:port/database
url =
# Max conn setting default is 0 (mean not set)
max_idle_conn =
max_open_conn =
# For "postgres", use either "disable", "require" or "verify-full"
# For "mysql", use either "true", "false", or "skip-verify".
ssl_mode = disable
ca_cert_path =
client_key_path =
client_cert_path =
server_cert_name =
# For "sqlite3" only, path relative to data_path setting
path = grafana.db
#################################### Session #############################
[session]
# Either "memory", "file", "redis", "mysql", "postgres", "memcache", default is "file"
provider = file
# Provider config options
# memory: not have any config yet
# file: session dir path, is relative to grafana data_path
# redis: config like redis server e.g. `addr=127.0.0.1:6379,pool_size=100,db=grafana`
# postgres: user=a password=b host=localhost port=5432 dbname=c sslmode=disable
# mysql: go-sql-driver/mysql dsn config string, examples:
# `user:password@tcp(127.0.0.1:3306)/database_name`
# `user:password@unix(/var/run/mysqld/mysqld.sock)/database_name`
# memcache: 127.0.0.1:11211
provider_config = sessions
# Session cookie name
cookie_name = grafana_sess
# If you use session in https only, default is false
cookie_secure = false
# Session life time, default is 86400
session_life_time = 86400
gc_interval_time = 86400
#################################### Data proxy ###########################
[dataproxy]
# This enables data proxy logging, default is false
logging = false
#################################### Analytics ###########################
[analytics]
# Server reporting, sends usage counters to stats.grafana.org every 24 hours.
# No ip addresses are being tracked, only simple counters to track
# running instances, dashboard and error counts. It is very helpful to us.
# Change this option to false to disable reporting.
reporting_enabled = true
# Set to false to disable all checks to https://grafana.com
# for new versions (grafana itself and plugins), check is used
# in some UI views to notify that grafana or plugin update exists
# This option does not cause any auto updates, nor send any information
# only a GET request to https://grafana.com to get latest versions
check_for_updates = true
# Google Analytics universal tracking code, only enabled if you specify an id here
google_analytics_ua_id =
# Google Tag Manager ID, only enabled if you specify an id here
google_tag_manager_id =
#################################### Security ############################
[security]
# default admin user, created on startup
admin_user = admin
# default admin password, can be changed before first start of grafana, or in profile settings
admin_password = admin
# used for signing
secret_key = SW2YcwTIb9zpOOhoPsMm
# Auto-login remember days
login_remember_days = 7
cookie_username = grafana_user
cookie_remember_name = grafana_remember
# disable gravatar profile images
disable_gravatar = false
# data source proxy whitelist (ip_or_domain:port separated by spaces)
data_source_proxy_whitelist =
[snapshots]
# snapshot sharing options
external_enabled = true
external_snapshot_url = https://snapshots-origin.raintank.io
external_snapshot_name = Publish to snapshot.raintank.io
# remove expired snapshot
snapshot_remove_expired = true
# remove snapshots after 90 days
snapshot_TTL_days = 90
#################################### Users ####################################
[users]
# disable user signup / registration
allow_sign_up = false
# Allow non admin users to create organizations
allow_org_create = false
# Set to true to automatically assign new users to the default organization (id 1)
auto_assign_org = true
# Default role new users will be automatically assigned (if auto_assign_org above is set to true)
auto_assign_org_role = Viewer
# Require email validation before sign up completes
verify_email_enabled = false
# Background text for the user field on the login page
login_hint = email or username
# Default UI theme ("dark" or "light")
default_theme = dark
[auth]
# Set to true to disable (hide) the login form, useful if you use OAuth
disable_login_form = false
# Set to true to disable the signout link in the side menu. useful if you use auth.proxy
disable_signout_menu = false
#################################### Anonymous Auth ######################
[auth.anonymous]
# enable anonymous access
enabled = false
# specify organization name that should be used for unauthenticated users
org_name = Main Org.
# specify role for unauthenticated users
org_role = Viewer
#################################### Github Auth #########################
[auth.github]
enabled = false
allow_sign_up = true
client_id = some_id
client_secret = some_secret
scopes = user:email
auth_url = https://github.com/login/oauth/authorize
token_url = https://github.com/login/oauth/access_token
api_url = https://api.github.com/user
team_ids =
allowed_organizations =
#################################### Google Auth #########################
[auth.google]
enabled = false
allow_sign_up = true
client_id = some_client_id
client_secret = some_client_secret
scopes = https://www.googleapis.com/auth/userinfo.profile https://www.googleapis.com/auth/userinfo.email
auth_url = https://accounts.google.com/o/oauth2/auth
token_url = https://accounts.google.com/o/oauth2/token
api_url = https://www.googleapis.com/oauth2/v1/userinfo
allowed_domains =
hosted_domain =
#################################### Grafana.com Auth ####################
# legacy key names (so they work in env variables)
[auth.grafananet]
enabled = false
allow_sign_up = true
client_id = some_id
client_secret = some_secret
scopes = user:email
allowed_organizations =
[auth.grafana_com]
enabled = false
allow_sign_up = true
client_id = some_id
client_secret = some_secret
scopes = user:email
allowed_organizations =
#################################### Generic OAuth #######################
[auth.generic_oauth]
name = OAuth
enabled = false
allow_sign_up = true
client_id = some_id
client_secret = some_secret
scopes = user:email
auth_url =
token_url =
api_url =
team_ids =
allowed_organizations =
#################################### Basic Auth ##########################
[auth.basic]
enabled = true
#################################### Auth Proxy ##########################
[auth.proxy]
enabled = false
header_name = X-WEBAUTH-USER
header_property = username
auto_sign_up = true
ldap_sync_ttl = 60
whitelist =
#################################### Auth LDAP ###########################
[auth.ldap]
enabled = true
config_file = /var/lib/grafana/ldap.toml
# Allow sign up should almost always be true (default) to allow new Grafana users to be created (if LDAP authentication is ok). If set to
# false only pre-existing Grafana users will be able to login (if LDAP authentication is ok).
allow_sign_up = true
#################################### SMTP / Emailing #####################
[smtp]
enabled = true
host = smtp-host.org.com:25 ## CHANGE THIS
user =
# If the password contains # or ; you have to wrap it with trippel quotes. Ex """#password;"""
password =
cert_file =
key_file =
skip_verify = true
from_address = [email protected]
from_name = Grafana
[emails]
welcome_email_on_sign_up = false
templates_pattern = emails/*.html
#################################### Logging ##########################
[log]
# Either "console", "file", "syslog". Default is console and file
# Use space to separate multiple modes, e.g. "console file"
mode = console file
# Either "debug", "info", "warn", "error", "critical", default is "info"
level = info
# optional settings to set different levels for specific loggers. Ex filters = sqlstore:debug
filters =
# For "console" mode only
[log.console]
level =
# log line format, valid options are text, console and json
format = console
# For "file" mode only
[log.file]
level =
# log line format, valid options are text, console and json
format = text
# This enables automated log rotate(switch of following options), default is true
log_rotate = true
# Max line number of single file, default is 1000000
max_lines = 1000000
# Max size shift of single file, default is 28 means 1 << 28, 256MB
max_size_shift = 28
# Segment log daily, default is true
daily_rotate = true
# Expired days of log file(delete after max days), default is 7
max_days = 7
[log.syslog]
level =
# log line format, valid options are text, console and json
format = text
# Syslog network type and address. This can be udp, tcp, or unix. If left blank, the default unix endpoints will be used.
network =
address =
# Syslog facility. user, daemon and local0 through local7 are valid.
facility =
# Syslog tag. By default, the process' argv[0] is used.
tag =
#################################### AMQP Event Publisher ################
[event_publisher]
enabled = false
rabbitmq_url = amqp://localhost/
exchange = grafana_events
#################################### Dashboard JSON files ################
[dashboards.json]
enabled = false
path = /var/lib/grafana/dashboards
#################################### Usage Quotas ########################
[quota]
enabled = false
#### set quotas to -1 to make unlimited. ####
# limit number of users per Org.
org_user = 10
# limit number of dashboards per Org.
org_dashboard = 100
# limit number of data_sources per Org.
org_data_source = 10
# limit number of api_keys per Org.
org_api_key = 10
# limit number of orgs a user can create.
user_org = 10
# Global limit of users.
global_user = -1
# global limit of orgs.
global_org = -1
# global limit of dashboards
global_dashboard = -1
# global limit of api_keys
global_api_key = -1
# global limit on number of logged in users.
global_session = -1
#################################### Alerting ############################
[alerting]
# Disable alerting engine & UI features
enabled = true
# Makes it possible to turn off alert rule execution but alerting UI is visible
execute_alerts = true
#################################### Internal Grafana Metrics ############
# Metrics available at HTTP API Url /api/metrics
[metrics]
enabled = true
interval_seconds = 10
# Send internal Grafana metrics to graphite
[metrics.graphite]
# Enable by setting the address setting (ex localhost:2003)
address =
prefix = prod.grafana.%(instance_name)s.
[grafana_net]
url = https://grafana.com
[grafana_com]
url = https://grafana.com
#################################### External Image Storage ##############
[external_image_storage]
# You can choose between (s3, webdav)
provider =
[external_image_storage.s3]
bucket_url =
access_key =
secret_key =
[external_image_storage.webdav]
url =
username =
password =
public_url =
ldap_pl.toml: |
# https://grafana.com/docs/grafana/latest/auth/ldap/
[[servers]]
# Ldap server host (specify multiple hosts space separated)
host = "ad-ldap.org.com" ### CHANGE THIS
# Default port is 389 or 636 if use_ssl = true
port = 389
# Set to true if LDAP server supports TLS
use_ssl = false
# Set to true if connect LDAP server with STARTTLS pattern (create connection in insecure, then upgrade to secure connection with TLS)
start_tls = false
# set to true if you want to skip SSL cert validation
ssl_skip_verify = false
# set to the path to your root CA certificate or leave unset to use system defaults
# root_ca_cert = "/path/to/certificate.crt"
# Authentication against LDAP servers requiring client certificates
# client_cert = "/path/to/client.crt"
# client_key = "/path/to/client.key"
# Search user bind dn
bind_dn = "cn=app_grafana,cn=users,dc=ms,dc=ds,dc=org,dc=com" ## change app_grafana to what ever valid service user
# Search user bind password
# If the password contains # or ; you have to wrap it with triple quotes. Ex """#password;"""
bind_password = '<GRAFANA_BIND_PWD>'
# User search filter, for example "(cn=%s)" or "(sAMAccountName=%s)" or "(uid=%s)"
# Allow login from email or username, example "(|(sAMAccountName=%s)(userPrincipalName=%s))"
# search_filter = "(&(objectClass=user)(objectClass=top)(cn=%s))"
search_filter = "(cn=%s)"
# An array of base dns to search through
search_base_dns = ["dc=ms,dc=ds,dc=org,dc=com"]
# group_search_filter = "(&(objectClass=posixGroup)(memberUid=%s))"
# group_search_filter_user_attribute = "distinguishedName"
# group_search_base_dns = ["ou=groups,dc=grafana,dc=org"]
# Specify names of the LDAP attributes your LDAP uses
[servers.attributes]
name = "givenName"
surname = "sn"
username = "cn"
member_of = "memberOf"
email = "mail"
[[servers.group_mappings]]
group_dn = "CN=APP_infra_admin,CN=Users,DC=ms,DC=ds,DC=org,DC=com" ## CHANGE APP_infra_admin to valid group name
org_role = "Admin"
# The Grafana organization database id, optional, if left out the default org (id 1) will be used
# org_id = 1
[[servers.group_mappings]]
#group_dn = "cn=users,dc=grafana,dc=org"
#org_role = "Editor"
[[servers.group_mappings]]
# If you want to match all (or no ldap groups) then you can use wildcard
group_dn = "*"
org_role = "Viewer"
- apiVersion: v1
kind: ConfigMap
metadata:
name: grafana-datasources
data:
prometheus.yml: |
# config file version
apiVersion: 1
# list of datasources to insert/update depending
# whats available in the database
datasources:
# <string, required> name of the datasource. Required
- name: DS-Prometheus
# <string, required> datasource type. Required
type: prometheus
# <string, required> access mode. direct or proxy. Required
access: proxy
# <int> org id. will default to orgId 1 if not specified
orgId: 1
# <string> url
url: http://prometheus:9090
version: 1
# <bool> mark as default datasource. Max one per org
isDefault: true
# <bool> allow users to edit datasources from the UI.
editable: true
- apiVersion: v1
kind: ConfigMap
metadata:
name: grafana-dashboard-config
data:
openshift-metrics-dashboard.yaml: |
apiVersion: 1
providers:
- name: 'default'
orgId: 1
folder: ''
type: file
disableDeletion: false
editable: false
options:
path: /var/lib/grafana-dashboards
- apiVersion: v1
kind: ConfigMap
metadata:
name: grafana-dashboards
data:
openshift-metrics-dashboard.json: |
{
}
oc process -f grafana-template.yaml | oc apply -f -
apiVersion: v1
kind: Template
metadata:
name: alertmanager
annotations:
"openshift.io/display-name": Prometheus - Alertmanager
description: |
A monitoring solution for an OpenShift cluster - collect and gather metrics and alerts from nodes, services, and the infrastructure. This is a tech preview feature.
iconClass: fa fa-cogs
tags: "monitoring, prometheus, alertmanager, time-series"
parameters:
- name: APP_NAME
description: "Value for app label."
- name: NAME_SPACE
description: "The name of the namespace (openshift project)"
- name: REPLICAS
description: "number of Alertmanager replicas"
objects:
- apiVersion: v1
kind: ServiceAccount
metadata:
name: alertmanager
- apiVersion: v1
kind: Service
metadata:
labels:
app: ${APP_NAME}
name: alertmanager
namespace: "${NAME_SPACE}"
spec:
ports:
- name: alertmanager
port: 9093
protocol: TCP
targetPort: alert-port
selector:
app: ${APP_NAME}
- apiVersion: v1
kind: Route
metadata:
annotations:
labels:
app: ${APP_NAME}
name: alertmanager
namespace: "${NAME_SPACE}"
spec:
port:
targetPort: alertmanager
to:
kind: Service
name: alertmanager
weight: 100
tls:
termination: edge
wildcardPolicy: None
- apiVersion: apps/v1beta1
kind: StatefulSet
metadata:
name: alertmanager
spec:
podManagementPolicy: Parallel
updateStrategy:
type: RollingUpdate
selector:
matchLabels:
app: alertmanager
replicas: ${REPLICAS}
template:
metadata:
labels:
app: ${APP_NAME}
annotations:
prometheus.io/path: /metrics
prometheus.io/port: "9093"
prometheus.io/scheme: http
prometheus.io/scrape: "true"
spec:
serviceAccountName: alertmanager
containers:
- name: alertmanager
args:
- --storage.path=/alertmanager/data/
- --config.file=/etc/alertmanager/alertmanager.yml
- --web.external-url=https://alertmanager-demo-org.com
image: prom/alertmanager:v0.21.0
resources:
limits:
cpu: "500m"
memory: "128Mi"
requests:
cpu: "250m"
memory: "64Mi"
ports:
- name: alert-port
containerPort: 9093
- name: cluster-port
containerPort: 9094
readinessProbe:
failureThreshold: 3
httpGet:
path: /-/ready
port: 9093
scheme: HTTP
initialDelaySeconds: 60
periodSeconds: 5
successThreshold: 1
timeoutSeconds: 1
livenessProbe:
failureThreshold: 3
httpGet:
path: /-/healthy
port: 9093
scheme: HTTP
initialDelaySeconds: 60
periodSeconds: 5
successThreshold: 1
timeoutSeconds: 1
volumeMounts:
- name: alertmanager-data
mountPath: /alertmanager/data/
- name: alertmanager-config-dir
mountPath: /etc/alertmanager
- name: configmap-reload
image: jimmidyson/configmap-reload:v0.4.0
imagePullPolicy: "IfNotPresent"
args:
- --volume-dir=/etc/alertmanager
- --webhook-url=http://localhost:9093/-/reload
volumeMounts:
- name: alertmanager-config-dir
mountPath: /etc/alertmanager
readOnly: true
volumes:
- name: alertmanager-config-dir
configMap:
defaultMode: 420
items:
- key: alertYaml
path: alertmanager.yml
name: alertmanager-config-map
volumeClaimTemplates:
- metadata:
name: alertmanager-data
spec:
accessModes: [ "ReadWriteMany" ]
resources:
requests:
storage: 2Gi
- apiVersion: v1
kind: ConfigMap
metadata:
name: alertmanager-config-map
data:
alertYaml: |-
---
# Alerting rules
#
# Required labels:
# alertname
# severity (critical, warning, information)
# service (prometheus, okd, rabbit, redis, kafka, application)
# scope (monitoring, infrastructure, messaging, db)
# Optional Labels:
# target (downstream)
# environment (stage, production)
global:
# The smarthost and SMTP sender used for mail notifications.
smtp_smarthost: 'mailo2.org.com:25'
smtp_from: '[email protected]'
smtp_require_tls: False
# The root route on which each incoming alert enters.
route:
# The labels by which incoming alerts are grouped together. For example,
# multiple alerts coming in for cluster=A and alertname=LatencyHigh would
# be batched into a single group.
group_by: ['alertname', 'severity', 'scope']
# When a new group of alerts is created by an incoming alert, wait at
# least 'group_wait' to send the initial notification.
# This way ensures that you get multiple alerts for the same group that start
# firing shortly after another are batched together on the first
# notification.
group_wait: 30s
# When the first notification was sent, wait 'group_interval' to send a batch
# of new alerts that started firing for that group.
group_interval: 5m
# If an alert has successfully been sent, wait 'repeat_interval' to
# resend them.
repeat_interval: 4h
# The root route must not have any matchers as it is the entry point for
# all alerts. It needs to have a receiver configured so alerts that do not
# match any of the sub-routes are sent to someone.
# severity page - will only send email to apps team
# severity alert - will send email and pager duty notifications to apps team
# severity notification - will send email notification to pep team
# severity warning - will send email and pager duty notification to pep team
receiver: 'catch-all-email-receiver'
routes:
- match:
environment: stage
repeat_interval: 8h
receiver: 'non-prod-email-receiver'
continue: false
# Literally anything with the word 'down'
- match_re:
alertname: ^(Down|down)$
repeat_interval: 2h
receiver: 'infra-email-receiver'
continue: true # Whether an alert should continue matching subsequent sibling nodes. default is false
receivers:
- name: 'non-prod-email-receiver'
email_configs:
- to: '[email protected]'
from: '[email protected]'
send_resolved: true
- name: 'critical-email-receiver'
email_configs:
- to: '[email protected]'
from: '[email protected]'
send_resolved: true # Whether or not to notify about resolved alerts. default is false
- name: 'infra-email-receiver'
email_configs:
- to: '[email protected]'
from: '[email protected]'
send_resolved: true
- name: 'catch-all-email-receiver'
email_configs:
- to: '[email protected]'
send_resolved: true
Alertmanager DeploymentConfig (alternative to the StatefulSet above)
apiVersion: v1
kind: Template
metadata:
name: alertmanager
annotations:
"openshift.io/display-name": Prometheus - Alertmanager
description: |
A monitoring solution for an OpenShift cluster - collect and gather metrics and alerts from nodes, services, and the infrastructure. This is a tech preview feature.
iconClass: fa fa-cogs
tags: "monitoring, prometheus, alertmanager, time-series"
parameters:
- name: APP_NAME
description: "Value for app label."
- name: NAME_SPACE
description: "The name of the namespace (openshift project)"
- name: REPLICAS
description: "number of Alertmanager replicas"
- name: VOLUME_CAPACITY
displayName: Volume Capacity
description: Volume space available for data, e.g. 512Mi, 2Gi.
value: 5Gi
required: true
objects:
- apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: alertmanager-data
spec:
accessModes:
- ReadWriteMany
resources:
requests:
storage: "${VOLUME_CAPACITY}"
- apiVersion: v1
kind: ServiceAccount
metadata:
name: alertmanager
- apiVersion: v1
kind: Service
metadata:
labels:
app: ${APP_NAME}
name: alertmanager
namespace: "${NAME_SPACE}"
spec:
ports:
- name: alertmanager
port: 9093
protocol: TCP
targetPort: alert-port
selector:
app: ${APP_NAME}
- apiVersion: v1
kind: Route
metadata:
annotations:
labels:
app: ${APP_NAME}
name: alertmanager
namespace: "${NAME_SPACE}"
spec:
port:
targetPort: alertmanager
to:
kind: Service
name: alertmanager
weight: 100
tls:
termination: edge
wildcardPolicy: None
- apiVersion: apps.openshift.io/v1
kind: DeploymentConfig
metadata:
labels:
app: ${APP_NAME}
name: ${APP_NAME}
namespace: "${NAME_SPACE}"
spec:
replicas: 1
selector:
app: ${APP_NAME}
template:
metadata:
labels:
app: ${APP_NAME}
name: ${APP_NAME}
spec:
serviceAccountName: alertmanager
containers:
- name: alertmanager
args:
- --storage.path=/alertmanager/data/
- --config.file=/etc/alertmanager/alertmanager.yml
- --web.external-url=https://alertmanager-demo-app.org.com
image: prom/alertmanager:v0.21.0
resources:
limits:
cpu: "500m"
memory: "128Mi"
requests:
cpu: "250m"
memory: "64Mi"
ports:
- name: alert-port
containerPort: 9093
- name: cluster-port
containerPort: 9094
livenessProbe:
failureThreshold: 3
httpGet:
path: /-/healthy
port: alert-port
scheme: HTTP
initialDelaySeconds: 40
periodSeconds: 5
successThreshold: 1
timeoutSeconds: 1
readinessProbe:
failureThreshold: 3
httpGet:
path: /-/ready
port: alert-port
scheme: HTTP
initialDelaySeconds: 30
periodSeconds: 5
successThreshold: 1
timeoutSeconds: 1
volumeMounts:
- name: alertmanager-data-volume
mountPath: /alertmanager/data/
- name: alertmanager-config-dir
mountPath: /etc/alertmanager
- name: configmap-reload
image: jimmidyson/configmap-reload:v0.4.0
imagePullPolicy: "IfNotPresent"
args:
- --volume-dir=/etc/alertmanager
- --webhook-url=http://localhost:9093/-/reload
volumeMounts:
- name: alertmanager-config-dir
mountPath: /etc/alertmanager
readOnly: true
volumes:
- name: alertmanager-config-dir
configMap:
defaultMode: 420
items:
- key: alertYaml
path: alertmanager.yml
name: alertmanager-config-map
- name: alertmanager-data-volume
persistentVolumeClaim:
claimName: alertmanager-data
- apiVersion: v1
kind: ConfigMap
metadata:
name: alertmanager-config-map
data:
alertYaml: |-
---
# Alerting rules
#
# Required labels:
# alertname
# severity (critical, warning, information)
# service (prometheus, okd, rabbit, redis, kafka, application)
# scope (monitoring, infrastructure, messaging, db)
# Optional Labels:
# target (downstream)
# environment (stage, production)
global:
# The smarthost and SMTP sender used for mail notifications.
smtp_smarthost: 'smtp-prod-org.com:25'
smtp_from: '[email protected]'
smtp_require_tls: False
# The root route on which each incoming alert enters.
route:
# The labels by which incoming alerts are grouped together. For example,
# multiple alerts coming in for cluster=A and alertname=LatencyHigh would
# be batched into a single group.
group_by: ['alertname', 'severity', 'scope']
# When a new group of alerts is created by an incoming alert, wait at
# least 'group_wait' to send the initial notification.
# This way ensures that you get multiple alerts for the same group that start
# firing shortly after another are batched together on the first
# notification.
group_wait: 30s
# When the first notification was sent, wait 'group_interval' to send a batch
# of new alerts that started firing for that group.
group_interval: 5m
# If an alert has successfully been sent, wait 'repeat_interval' to
# resend them.
repeat_interval: 4h
# The root route must not have any matchers as it is the entry point for
# all alerts. It needs to have a receiver configured so alerts that do not
# match any of the sub-routes are sent to someone.
# severity page - will only send email to apps team
# severity alert - will send email and pager duty notifications to apps team
# severity notification - will send email notification to pep team
# severity warning - will send email and pager duty notification to pep team
receiver: 'catch-all-email-receiver'
routes:
- match:
environment: stage
repeat_interval: 8h
receiver: 'non-prod-email-receiver'
continue: false
# Literally anything with the word 'down'
- match_re:
alertname: ^(Down|down)$
repeat_interval: 2h
receiver: 'infra-email-receiver'
continue: true # Whether an alert should continue matching subsequent sibling nodes. default is false
receivers:
- name: 'non-prod-email-receiver'
email_configs:
- to: '[email protected]'
from: '[email protected]'
send_resolved: true
- name: 'critical-email-receiver'
email_configs:
- to: '[email protected]'
from: '[email protected]'
send_resolved: true # Whether or not to notify about resolved alerts. default is false
- name: 'infra-email-receiver'
email_configs:
- to: '[email protected]'
from: '[email protected]'
send_resolved: true
- name: 'catch-all-email-receiver'
email_configs:
- to: '[email protected]'
send_resolved: true
Prometheus StatefulSet
apiVersion: v1
kind: Template
metadata:
name: prometheus
annotations:
"openshift.io/display-name": Prometheus
description: |
A monitoring solution for an OpenShift cluster - collect and gather metrics and alerts from nodes, services, and the infrastructure. This is a tech preview feature.
iconClass: fa fa-cogs
tags: "monitoring, prometheus, time-series"
parameters:
- name: APP_NAME
description: "Value for app label."
- name: NAME_SPACE
description: "The name of the namespace (Openshift project)"
objects:
- apiVersion: v1
kind: ServiceAccount
metadata:
name: prometheus
namespace: "${NAME_SPACE}"
- apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
name: prometheus
rules:
- apiGroups:
- ''
resources:
- services
- endpoints
- pods
verbs:
- get
- list
- watch
- apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: prometheus
roleRef:
name: prometheus
apiGroup: rbac.authorization.k8s.io
kind: Role
subjects:
- kind: ServiceAccount
name: prometheus
namespace: "${NAME_SPACE}"
# Create a fully end-to-end TLS connection to the prometheus proxy
- apiVersion: route.openshift.io/v1
kind: Route
metadata:
name: prometheus
namespace: "${NAME_SPACE}"
spec:
port:
targetPort: prometheus
to:
kind: Service
name: prometheus
weight: 100
tls:
termination: edge
wildcardPolicy: None
- apiVersion: v1
kind: Service
metadata:
annotations:
prometheus.io/scrape: "true"
prometheus.io/scheme: https
labels:
name: prometheus
name: prometheus
namespace: "${NAME_SPACE}"
spec:
ports:
- name: prometheus
port: 9090
protocol: TCP
targetPort: prometheus-port
selector:
app: prometheus
- apiVersion: apps/v1beta1
kind: StatefulSet
metadata:
labels:
app: prometheus
name: prometheus
namespace: "${NAME_SPACE}"
spec:
updateStrategy:
type: RollingUpdate
podManagementPolicy: Parallel
selector:
matchLabels:
app: prometheus
template:
metadata:
labels:
app: prometheus
name: prometheus
spec:
serviceAccountName: prometheus
containers:
- name: prometheus
args:
- --storage.tsdb.retention=30d
- --storage.tsdb.min-block-duration=2m
- --config.file=/etc/prometheus/prometheus.yml
- --web.enable-lifecycle
- --web.external-url=https://prometheus-demo-app.com
image: prom/prometheus:v2.23.0
imagePullPolicy: IfNotPresent
ports:
- name: prometheus-port
containerPort: 9090
livenessProbe:
failureThreshold: 3
httpGet:
path: /-/healthy
port: prometheus-port
scheme: HTTP
initialDelaySeconds: 60
periodSeconds: 5
successThreshold: 1
timeoutSeconds: 1
readinessProbe:
failureThreshold: 3
httpGet:
path: /-/ready
port: prometheus-port
scheme: HTTP
initialDelaySeconds: 30
periodSeconds: 5
successThreshold: 1
timeoutSeconds: 1
volumeMounts:
- mountPath: /etc/prometheus
name: prometheus-config
- mountPath: /prometheus
name: prometheus-data
- name: configmap-reload
image: jimmidyson/configmap-reload:v0.4.0
imagePullPolicy: "IfNotPresent"
args:
- --volume-dir=/etc/prometheus
- --webhook-url=http://localhost:9090/-/reload
volumeMounts:
- name: prometheus-config
mountPath: /etc/prometheus
readOnly: true
restartPolicy: Always
volumes:
- name: prometheus-config
configMap:
defaultMode: 420
name: prometheus
volumeClaimTemplates:
- metadata:
name: prometheus-data
spec:
accessModes: [ "ReadWriteMany" ]
resources:
requests:
storage: 15Gi
- apiVersion: v1
kind: ConfigMap
metadata:
name: prometheus
namespace: "${NAME_SPACE}"
data:
alerting.rules: |
# Alerting rules
#
# Required labels:
# alertname
# severity (critical, warning, information)
# service (prometheus, okd, rabbit, redis, kafka, application)
# scope (monitoring, infrastructure, messaging, db)
# Optional Labels:
# target (downstream)
# environment (stage, production)
groups:
- name: Prometheus
interval: 30s # defaults to global interval
rules:
- alert: PrometheusJobMissing
expr: absent(up{job="prometheus"})
for: 5m
labels:
severity: critical
service: prometheus
scope: monitoring
annotations:
summary: Prometheus job missing (instance {{ $labels.instance }})
description: A Prometheus job has disappeared
- alert: PrometheusAllTargetsMissing
expr: count (up) by (job) == 0
for: 5m
labels:
severity: warning
service: prometheus
scope: monitoring
annotations:
summary: Prometheus all targets missing (instance {{ $labels.instance }})
description: A Prometheus job does not have living target anymore.
- alert: PrometheusConfigurationReloadFailure
expr: prometheus_config_last_reload_successful != 1
for: 5m
labels:
severity: warning
service: prometheus
scope: monitoring
annotations:
summary: Prometheus configuration reload failure (instance {{ $labels.instance }})
description: Prometheus configuration reload error
- alert: PrometheusTooManyRestarts
expr: changes(process_start_time_seconds{job=~"prometheus|kubernetes-pods|kafka-prod|dependent_apps|alertmanager"}[15m]) > 2
for: 5m
labels:
severity: warning
service: prometheus
scope: monitoring
annotations:
summary: Prometheus too many restarts (instance {{ $labels.instance }})
description: Prometheus has restarted more than twice in the last 15 minutes. It might be crash looping.
- name: Applications
interval: 30s # defaults to global interval
rules:
- alert: JvmOutOfMemory
expr: jvm_memory_used_bytes / jvm_memory_max_bytes * 100 > 90
for: 5m
labels:
severity: warning
service: okd
scope: infrastructure
annotations:
title: JVM out of memory
description: JVM is running out of memory (> 90%)
- alert: ProcessCpuUsage
expr: process_cpu_usage * 100 > 80
for: 5m
labels:
severity: warning
service: okd
scope: infrastructure
annotations:
summary: "Process CPU for {{ $labels.job }} is above 80%"
- alert: FailedHttpRequestsFromApplication
expr: sum by (kubernetes_pod_name, clientName, method, uri, status, outcome) (rate(http_client_requests_seconds_count{status!~"^[2-3][0-9][0-9]$"}[5m]))
for: 5m
labels:
severity: warning
service: application
scope: infrastructure
target: downstream
annotations:
summary: HTTP Requests failed for Host = {{ $labels.clientName }}
- alert: FailedHttpRequestsToActuator
expr: sum by (kubernetes_pod_name, clientName, method, uri, status, outcome)(rate(http_server_requests_seconds_count{uri=~".*actuator.*", status!~"^[2-3][0-9][0-9]$"}[5m]))
for: 5m
labels:
severity: warning
service: application
scope: infrastructure
annotations:
summary: HTTP Requests failed from Host = {{ $labels.clientName }}
- alert: FailedHttpRequestsToApplication
expr: sum by (kubernetes_pod_name, clientName, method, uri, status, outcome)(rate(http_server_requests_seconds_count{uri!~".*actuator.*", status!~"^[2-3][0-9][0-9]$"}[5m]))
labels:
severity: warning
service: application
scope: infrastructure
annotations:
summary: HTTP Requests failed from Host = {{ $labels.clientName }}
- name: Rabbit MQ
interval: 30s # defaults to global interval
rules:
- alert: RabbitmqNodeDown
expr: sum(rabbitmq_build_info) < 3
for: 5m
labels:
severity: critical
service: rabbit
scope: messaging
annotations:
title: Rabbitmq node down for instance {{ $labels.instance }}
description: Less than 3 nodes running in RabbitMQ cluster
- alert: RabbitmqNodeNotDistributed
expr: erlang_vm_dist_node_state < 3
for: 5m
labels:
severity: critical
service: rabbit
scope: messaging
annotations:
title: Rabbitmq node not distributed for instance {{ $labels.instance }}
description: Distribution link state is not 'up'
- alert: RabbitmqInstancesDifferentVersions
expr: count(count(rabbitmq_build_info) by (rabbitmq_version)) > 1
for: 5m
labels:
severity: warning
service: rabbit
scope: messaging
annotations:
title: Rabbitmq instances different versions for instance {{ $labels.instance }}
description: Running different version of Rabbitmq in the same cluster, can lead to failure.
- alert: RabbitmqMemoryHigh
expr: rabbitmq_process_resident_memory_bytes / rabbitmq_resident_memory_limit_bytes * 100 > 90
for: 5m
labels:
severity: warning
service: rabbit
scope: infrastructure
annotations:
title: Rabbitmq memory high for instance {{ $labels.instance }}
description: A node use more than 90% of allocated RAM
- alert: RabbitmqFileDescriptorsUsage
expr: rabbitmq_process_open_fds / rabbitmq_process_max_fds * 100 > 90
for: 5m
labels:
severity: warning
service: rabbit
scope: infrastructure
annotations:
title: Rabbitmq file descriptors usage for instance {{ $labels.instance }}
description: A node use more than 90% of file descriptors
- alert: RabbitmqTooMuchUnack
expr: sum(rabbitmq_queue_messages_unacked) BY (queue) > 1000
for: 5m
labels:
severity: warning
service: rabbit
scope: messaging
annotations:
title: Rabbitmq too much unack for instance {{ $labels.instance }}
description: Too much unacknowledged messages
- alert: RabbitmqTooMuchConnections
expr: rabbitmq_connections > 1000
for: 5m
labels:
severity: warning
service: rabbit
scope: messaging
annotations:
title: Rabbitmq too much connections for instance {{ $labels.instance }}
description: The total connections of a node is too high
- alert: RabbitmqNoQueueConsumer
expr: rabbitmq_queue_consumers < 1
for: 5m
labels:
severity: information
service: rabbit
scope: messaging
annotations:
title: Rabbitmq no queue consumer for instance {{ $labels.instance }}
description: A queue has less than 1 consumer
- alert: RabbitmqUnroutableMessages
expr: increase(rabbitmq_channel_messages_unroutable_returned_total[5m]) > 0 or increase(rabbitmq_channel_messages_unroutable_dropped_total[5m]) > 0
for: 5m
labels:
severity: warning
service: rabbit
scope: messaging
annotations:
title: Rabbitmq unroutable messages for instance {{ $labels.instance }}
description: A queue has unroutable messages
- name: Kubernetes PODs Down
interval: 30s # defaults to global interval
rules:
- alert: PodDown
expr: up{job="kubernetes-pods"} == 0
for: 1m
labels:
severity: critical
service: okd
scope: infrastructure
annotations:
title: "{{$labels.kubernetes_pod_name}} is down on {{$labels.kubernetes_namespace}}"
- name: Redis
interval: 30s # defaults to global interval
rules:
- alert: RedisInstanceDown
expr: redis_up == 0
for: 1m
labels:
severity: critical
service: redis
scope: db
annotations:
title: Redis instance is down
description: Redis is down at {{ $labels.instance }} for 1 minute.
- alert: RedisClusterDown
expr: min(redis_cluster_state) == 0
for: 1m
labels:
severity: critical
service: redis
scope: db
annotations:
title: Redis cluster is down
description: Redis cluster is down at {{ $labels.instance }} for 1 minute.
- alert: RedisMissingMaster
expr: ( count (redis_instance_info{role="master"} ) by (role) ) < 3
for: 1m
labels:
severity: critical
service: redis
scope: db
annotations:
title: Redis missing master
description: Redis cluster has fewer than 3 masters
- alert: RedisTooManyMasters
expr: count (redis_instance_info{role="master"} ) by (role) > 3
for: 1m
labels:
severity: critical
service: redis
scope: db
annotations:
title: Redis too many masters at instance {{ $labels.instance }}
description: Redis cluster has too many nodes marked as master
- alert: RedisDisconnectedSlaves
expr: ( sum without (instance, statefulset_kubernetes_io_pod_name, controller_revision_hash, kubernetes_pod_name) (redis_connected_slaves) ) < 3
for: 1m
labels:
severity: critical
service: redis
scope: db
annotations:
title: Redis disconnected slaves for instance {{ $labels.instance }}
description: Redis not replicating for all slaves. Consider reviewing the redis replication status.
- alert: RedisReplicationBroken
expr: delta(redis_connected_slaves[10m]) < 0
for: 10m
labels:
severity: critical
service: redis
scope: db
annotations:
title: Redis replication broken for instance {{ $labels.instance }}
description: Redis instance lost a slave
- alert: RedisClusterFlapping
expr: changes(redis_connected_slaves[10m]) > 2
for: 10m
labels:
severity: critical
service: redis
scope: db
annotations:
title: Redis cluster flapping
description: Changes have been detected in Redis replica connection. This can occur when replica nodes lose connection to the master and reconnect (a.k.a flapping).
# - alert: RedisMissingBackup
# expr: time() - redis_rdb_last_save_timestamp_seconds > 60 * 60 * 24
# for: 10m
# labels:
# severity: critical
# annotations:
# title: Redis missing backup for instance {{ $labels.instance }}
# description: Redis has not been backed up for 24 hours
- alert: RedisOutOfMemory
expr: ( redis_memory_used_bytes / redis_memory_max_bytes * 100 ) > 90
for: 5m
labels:
severity: warning
service: redis
scope: db
annotations:
title: Redis out of memory at instance {{ $labels.instance }}
description: Redis is running out of memory (> 90%)
- alert: RedisNotEnoughConnections
expr: redis_connected_clients < 3
for: 5m
labels:
severity: information
service: redis
scope: db
annotations:
title: Redis not enough connections
description: Redis instance has fewer than 3 connected clients
- alert: RedisTooManyConnections
expr: redis_connected_clients > 100
for: 5m
labels:
severity: warning
service: redis
scope: db
annotations:
title: Redis too many connections at instance {{ $labels.instance }}
description: Redis instance has too many connections
- alert: RedisRejectedConnections
expr: increase(redis_rejected_connections_total[5m]) > 0
for: 5m
labels:
severity: critical
service: redis
scope: db
annotations:
title: Redis rejected connections at instance {{ $labels.instance }}
description: Some connections to Redis have been rejected
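# Kafka consumer-lag and in-sync-replica alerts based on kafka_exporter metrics.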
- name: Kafka
interval: 30s # defaults to global interval
rules:
- alert: KafkaLagStage
expr: sum(kafka_consumergroup_lag{consumergroup=~"stage-condumer-s.+"}) by (consumergroup, topic) > 100
for: 5m
labels:
severity: warning
service: kafka
scope: messaging
environment: stage
annotations:
title: Kafka Consumer Lag in Stage
description: Consumer group lag is {{ $value }} for topic = {{ $labels.topic }} and consumer group = {{ $labels.consumergroup }}
- alert: KafkaLagProd
expr: sum(kafka_consumergroup_lag{consumergroup=~"production-consumder-p.+"}) by (consumergroup, topic) > 100
for: 5m
labels:
severity: critical
service: kafka
scope: messaging
environment: production
annotations:
title: Kafka Consumer Lag in Production
description: Consumer group lag is {{ $value }} for topic = {{ $labels.topic }} and consumer group = {{ $labels.consumergroup }}
- alert: KafkaTopicsReplicas
expr: sum(kafka_topic_partition_in_sync_replica) by (topic) < 3
for: 5m
labels:
severity: critical
service: kafka
scope: messaging
annotations:
summary: Kafka topic has fewer than 3 in-sync replicas
description: In-sync replica count is {{ $value }} for topic = {{ $labels.topic }}
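# Alerts on the monitoring pipeline itself: exporter availability and blackbox-exporter probe results.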
- name: Exporters
interval: 30s # defaults to global interval
rules:
- alert: KafkaExporter
expr: up{instance=~"kafka-.+", job="kafka-prod"} == 0
for: 3m
labels:
severity: warning
service: kafka
scope: infrastructure
annotations:
title: Kafka Exporter is Down
description: Kafka Exporter is down on {{ $labels.instance }}. Could not scrape kafka-exporter for 3m.
- alert: BlackboxProbeFailed
expr: probe_success == 0
for: 5m
labels:
severity: warning
service: prometheus
scope: infrastructure
annotations:
title: Blackbox probe failed for instance {{ $labels.instance }}
- alert: BlackboxSlowProbe
expr: avg_over_time(probe_duration_seconds[1m]) > 1
for: 5m
labels:
severity: warning
service: prometheus
scope: infrastructure
annotations:
title: Blackbox slow probe for instance {{ $labels.instance }}
description: Blackbox probe took more than 1s to complete
- alert: BlackboxProbeHttpFailure
expr: probe_http_status_code <= 199 OR probe_http_status_code >= 400
for: 5m
labels:
severity: warning
service: application
scope: messaging
target: downstream
annotations:
title: Blackbox probe HTTP failure instance {{ $labels.instance }}
description: HTTP status code is not 200-399
- alert: BlackboxSslCertificateWillExpireSoon
expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 30
for: 5m
labels:
severity: critical
service: application
scope: infrastructure
annotations:
title: Blackbox SSL certificate will expire soon for instance {{ $labels.instance }}
description: SSL certificate expires within 30 days
- alert: BlackboxSslCertificateExpired
expr: probe_ssl_earliest_cert_expiry - time() <= 0
for: 5m
labels:
severity: critical
service: application
scope: infrastructure
annotations:
title: Blackbox SSL certificate expired for instance {{ $labels.instance }}
description: SSL certificate has already expired
- alert: BlackboxProbeSlowHttp
expr: avg_over_time(probe_http_duration_seconds[1m]) > 1
for: 5m
labels:
severity: warning
service: prometheus
scope: monitoring
annotations:
title: Blackbox probe slow HTTP for instance {{ $labels.instance }}
description: HTTP request took more than 1s
recording.rules: |
groups:
- name: aggregate_container_resources
rules:
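# Pre-aggregated container CPU/memory and apiserver request-rate series, recorded so that dashboard queries stay cheap.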
- record: container_cpu_usage_rate
expr: sum without (cpu) (rate(container_cpu_usage_seconds_total[5m]))
- record: container_memory_rss_by_type
expr: container_memory_rss{id=~"/|/system.slice|/kubepods.slice"} > 0
- record: container_cpu_usage_percent_by_host
expr: sum(rate(container_cpu_usage_seconds_total{id="/"}[5m])) BY(kubernetes_io_hostname) / ON(kubernetes_io_hostname) machine_cpu_cores
- record: apiserver_request_count_rate_by_resources
expr: sum without (client,instance,contentType) (rate(apiserver_request_count[5m]))
prometheus.yml: |
rule_files:
- '*.rules'
scrape_configs:
- job_name: 'prometheus'
static_configs:
- targets: ['localhost:9090']
- job_name: 'alertmanager'
scheme: https
static_configs:
- targets: ['alertmanager-demo-org.com']
tls_config:
ca_file: /etc/prometheus/ca.crt
#cert_file: /etc/etcd/ssl/client.pem
#key_file: /etc/etcd/ssl/client-key.pem
insecure_skip_verify: false
# Scrape config for the pods in ${NAME_SPACE} namespace
- job_name: 'kubernetes-pods'
kubernetes_sd_configs:
- role: pod
namespaces:
names:
- ${NAME_SPACE}
relabel_configs:
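# Only keep pods annotated with prometheus.io/scrape=true, and honor the prometheus.io/path and prometheus.io/port annotations when building the scrape target.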
- source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
action: keep
regex: true
- source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
action: replace
target_label: __metrics_path__
regex: (.+)
- source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
action: replace
regex: ([^:]+)(?::\d+)?;(\d+)
replacement: $1:$2
target_label: __address__
- action: labelmap
regex: __meta_kubernetes_pod_label_(.+)
- source_labels: [__meta_kubernetes_namespace]
action: replace
target_label: kubernetes_namespace
- source_labels: [__meta_kubernetes_pod_name]
action: replace
target_label: kubernetes_pod_name
#- job_name: 'rmq-prod'
# scrape_interval: 5s
# static_configs:
# - targets: ['rmq-metrics-demo-org.com:80']
- job_name: 'kafka-prod'
scrape_interval: 5s
static_configs:
- targets: ['kafka-exporter-demo-org.com']
scheme: https
tls_config:
ca_file: /etc/prometheus/ca.crt
- job_name: 'ldap_check'
scrape_interval: 5m
metrics_path: /probe
params:
module: [tcp_connect]
static_configs:
- targets:
- 'ad-ldap-prod.org.com:389'
tls_config:
ca_file: /etc/prometheus/ca.crt
relabel_configs:
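# Standard blackbox-exporter relabeling: the listed target becomes the probe's 'target' parameter and its 'instance' label, while the scrape itself is sent to the blackbox exporter.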
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: blackbox-exporter-demo-org.com # The blackbox exporter's real hostname:port.
- target_label: __scheme__
replacement: https
- job_name: 'dependent_apps'
metrics_path: /probe
scrape_interval: 1m
params:
module: [http_orgca_2xx] # Look for an HTTP 2xx response.
static_configs:
- targets:
- https://dependent1.com
- https://dependent2.com
tls_config:
ca_file: /etc/prometheus/ca.crt
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: blackbox-exporter-demo-org.com # The blackbox exporter's real hostname:port.
- target_label: __scheme__
replacement: https
alerting:
alertmanagers:
- scheme: https
static_configs:
- targets:
- "alertmanager-demo-org.com"
tls_config:
ca_file: /etc/prometheus/ca.crt
ca.crt: |
-----BEGIN CERTIFICATE-----
Add certificate
-----END CERTIFICATE-----
-----BEGIN CERTIFICATE-----
Add certificate
-----END CERTIFICATE-----
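The severity, service and scope labels attached to the alerts above exist mainly so that Alertmanager can route them to the right team or channel. Below is a minimal routing sketch for the Alertmanager referenced in the alerting block; the receiver names (team-infra, pager, team-messaging) are hypothetical placeholders and would need real notification configurations (email, webhook, Slack, etc.) attached:
route:
  receiver: team-infra
  group_by: ['alertname', 'service', 'scope']
  routes:
  # Anything critical pages immediately, regardless of which service raised it.
  - match:
      severity: critical
    receiver: pager
  # Messaging-scoped warnings (RabbitMQ, Kafka) go to the messaging team.
  - match:
      scope: messaging
    receiver: team-messaging
receivers:
- name: team-infra
- name: pager
- name: team-messaging
Keeping team- and channel-specific details in Alertmanager routes, keyed on these labels, means the Prometheus rules above do not have to change when ownership of a service moves.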