Commit 5b7e7c1

cleanup newlines
1 parent a760257 commit 5b7e7c1

3 files changed: 244 additions and 8 deletions

operations/tempo-mixin-compiled/dashboards/tempo-reads.json

Lines changed: 215 additions & 5 deletions
@@ -704,7 +704,7 @@
         "steppedLine": false,
         "targets": [
           {
-            "expr": "sum by (status) (\n label_replace(label_replace(rate(tempo_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/ingester\", route=~\"/tempopb.Querier/.*\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n",
+            "expr": "sum by (status) (\n label_replace(label_replace(rate(tempo_querier_external_endpoint_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/querier\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n",
             "format": "time_series",
             "interval": "1m",
             "legendFormat": "{{status}}",
@@ -782,6 +782,216 @@
         "renderer": "flot",
         "seriesOverrides": [
 
+        ],
+        "spaceLength": 10,
+        "span": 6,
+        "stack": false,
+        "steppedLine": false,
+        "targets": [
+          {
+            "expr": "histogram_quantile(0.99, sum(rate(tempo_querier_external_endpoint_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/querier\"}[$__rate_interval])) by (le,endpoint)) * 1e3",
+            "format": "time_series",
+            "interval": "1m",
+            "intervalFactor": 2,
+            "legendFormat": "{{route}} 99th",
+            "refId": "A",
+            "step": 10
+          },
+          {
+            "expr": "histogram_quantile(0.50, sum(rate(tempo_querier_external_endpoint_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/querier\"}[$__rate_interval])) by (le,endpoint)) * 1e3",
+            "format": "time_series",
+            "interval": "1m",
+            "intervalFactor": 2,
+            "legendFormat": "{{route}} 50th",
+            "refId": "B",
+            "step": 10
+          },
+          {
+            "expr": "sum(rate(tempo_querier_external_endpoint_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/querier\"}[$__rate_interval])) by (endpoint) * 1e3 / sum(rate(tempo_querier_external_endpoint_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/querier\"}[$__rate_interval])) by (endpoint)",
+            "format": "time_series",
+            "interval": "1m",
+            "intervalFactor": 2,
+            "legendFormat": "{{route}} Average",
+            "refId": "C",
+            "step": 10
+          }
+        ],
+        "thresholds": [
+
+        ],
+        "timeFrom": null,
+        "timeShift": null,
+        "title": "Latency",
+        "tooltip": {
+          "shared": true,
+          "sort": 2,
+          "value_type": "individual"
+        },
+        "type": "graph",
+        "xaxis": {
+          "buckets": null,
+          "mode": "time",
+          "name": null,
+          "show": true,
+          "values": [
+
+          ]
+        },
+        "yaxes": [
+          {
+            "format": "ms",
+            "label": null,
+            "logBase": 1,
+            "max": null,
+            "min": 0,
+            "show": true
+          },
+          {
+            "format": "short",
+            "label": null,
+            "logBase": 1,
+            "max": null,
+            "min": null,
+            "show": false
+          }
+        ]
+      }
+    ],
+    "repeat": null,
+    "repeatIteration": null,
+    "repeatRowId": null,
+    "showTitle": true,
+    "title": "Querier External Endpoint",
+    "titleSize": "h6"
+  },
+  {
+    "collapse": false,
+    "height": "250px",
+    "panels": [
+      {
+        "aliasColors": {
+          "1xx": "#EAB839",
+          "2xx": "#7EB26D",
+          "3xx": "#6ED0E0",
+          "4xx": "#EF843C",
+          "5xx": "#E24D42",
+          "OK": "#7EB26D",
+          "cancel": "#A9A9A9",
+          "error": "#E24D42",
+          "success": "#7EB26D"
+        },
+        "bars": false,
+        "dashLength": 10,
+        "dashes": false,
+        "datasource": "$datasource",
+        "fill": 10,
+        "id": 9,
+        "legend": {
+          "avg": false,
+          "current": false,
+          "max": false,
+          "min": false,
+          "show": true,
+          "total": false,
+          "values": false
+        },
+        "lines": true,
+        "linewidth": 0,
+        "links": [
+
+        ],
+        "nullPointMode": "null as zero",
+        "percentage": false,
+        "pointradius": 5,
+        "points": false,
+        "renderer": "flot",
+        "seriesOverrides": [
+
+        ],
+        "spaceLength": 10,
+        "span": 6,
+        "stack": true,
+        "steppedLine": false,
+        "targets": [
+          {
+            "expr": "sum by (status) (\n label_replace(label_replace(rate(tempo_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/ingester\", route=~\"/tempopb.Querier/.*\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n",
+            "format": "time_series",
+            "interval": "1m",
+            "legendFormat": "{{status}}",
+            "refId": "A"
+          }
+        ],
+        "thresholds": [
+
+        ],
+        "timeFrom": null,
+        "timeShift": null,
+        "title": "QPS",
+        "tooltip": {
+          "shared": true,
+          "sort": 2,
+          "value_type": "individual"
+        },
+        "type": "graph",
+        "xaxis": {
+          "buckets": null,
+          "mode": "time",
+          "name": null,
+          "show": true,
+          "values": [
+
+          ]
+        },
+        "yaxes": [
+          {
+            "format": "short",
+            "label": null,
+            "logBase": 1,
+            "max": null,
+            "min": 0,
+            "show": true
+          },
+          {
+            "format": "short",
+            "label": null,
+            "logBase": 1,
+            "max": null,
+            "min": null,
+            "show": false
+          }
+        ]
+      },
+      {
+        "aliasColors": {
+
+        },
+        "bars": false,
+        "dashLength": 10,
+        "dashes": false,
+        "datasource": "$datasource",
+        "fill": 1,
+        "id": 10,
+        "legend": {
+          "avg": false,
+          "current": false,
+          "max": false,
+          "min": false,
+          "show": true,
+          "total": false,
+          "values": false
+        },
+        "lines": true,
+        "linewidth": 1,
+        "links": [
+
+        ],
+        "nullPointMode": "null as zero",
+        "percentage": false,
+        "pointradius": 5,
+        "points": false,
+        "renderer": "flot",
+        "seriesOverrides": [
+
         ],
         "spaceLength": 10,
         "span": 6,
@@ -885,7 +1095,7 @@
         "dashes": false,
         "datasource": "$datasource",
         "fill": 10,
-        "id": 9,
+        "id": 11,
         "legend": {
           "avg": false,
           "current": false,
@@ -970,7 +1180,7 @@
         "dashes": false,
         "datasource": "$datasource",
         "fill": 1,
-        "id": 10,
+        "id": 12,
         "legend": {
           "avg": false,
           "current": false,
@@ -1095,7 +1305,7 @@
         "dashes": false,
         "datasource": "$datasource",
         "fill": 10,
-        "id": 11,
+        "id": 13,
         "legend": {
           "avg": false,
           "current": false,
@@ -1180,7 +1390,7 @@
         "dashes": false,
        "datasource": "$datasource",
         "fill": 1,
-        "id": 12,
+        "id": 14,
         "legend": {
           "avg": false,
           "current": false,

operations/tempo-mixin/dashboards/tempo-reads.libsonnet

Lines changed: 11 additions & 0 deletions
@@ -39,6 +39,17 @@ dashboard_utils {
         $.latencyPanel('tempo_request_duration_seconds', '{%s,route=~"querier_%sapi_.*"}' % [$.jobMatcher($._config.jobs.querier), $._config.http_api_prefix], additional_grouping='route')
       )
     )
+    .addRow(
+      g.row('Querier External Endpoint')
+      .addPanel(
+        $.panel('QPS') +
+        $.qpsPanel('tempo_querier_external_endpoint_duration_seconds_count{%s}' % [$.jobMatcher($._config.jobs.querier)])
+      )
+      .addPanel(
+        $.panel('Latency') +
+        $.latencyPanel('tempo_querier_external_endpoint_duration_seconds', '{%s}' % [$.jobMatcher($._config.jobs.querier)], additional_grouping='endpoint')
+      )
+    )
     .addRow(
       g.row('Ingester')
       .addPanel(
operations/tempo-mixin/runbook.md

Lines changed: 18 additions & 3 deletions
@@ -45,9 +45,6 @@ Another way to increase parallelism is by increasing the size of the worker pool
 A theoretically ideal value for this config to avoid _any_ queueing would be (Size of blocklist / Max Concurrent Queries).
 But also factor in the resources provided to the querier.
 
-Our [documentation](https://grafana.com/docs/tempo/latest/operations/backend_search/#query-frontend)
-includes [a solid guide](https://grafana.com/docs/tempo/latest/operations/backend_search/#guidelines-on-key-configuration-parameters) on the various parameters with suggestions.
-
 ### Trace Lookup Failures
 
 If trace lookups fail with the error: `error querying store in Querier.FindTraceByID: queue doesn't have room for <xyz> jobs`, this
@@ -73,6 +70,24 @@ Consider the following resolutions:
 - Increase the queue_depth size to do more work per querier
 - Adjust compaction settings to reduce the number of blocks
 
+### Serverless/External Endpoints
+
+If request latency issues are due to backend searches that use serverless/external endpoints, there may be additional configuration
+options that will help decrease latency. Before you get started, know that serverless functionality only impacts the `/api/search` endpoint
+when `start` and `end` parameters are passed and `external_endpoints` are configured on the querier. One way to determine whether external
+endpoints are being hit is to check the Reads dashboard and look for the "Querier External Endpoint" row.
+
+Tuning serverless search can be difficult. Our [public documentation](https://grafana.com/docs/tempo/latest/operations/backend_search/#query-frontend)
+includes a solid guide on the various parameters with suggestions. The suggestions below augment that guide:
+
+- Consider provisioning more serverless functions and adding them to the `querier.search.external_endpoints` array. This will increase your
+  baseline latency and your total throughput.
+- Decreasing `querier.search.hedge_requests_at` and increasing `querier.search.hedge_requests_up_to` will put more pressure on the serverless
+  endpoints but will result in lower latency.
+- Increasing `querier.search.prefer_self` and scaling up the queriers will cause more work to be performed by the queriers, which will lower latencies.
+- Increasing `query_frontend.max_outstanding_per_tenant` and `query_frontend.search.concurrent_jobs` will increase the rate at which the
+  query_frontend feeds jobs to the queriers and can decrease latency.
+
 ## TempoCompactorUnhealthy
 
 If this occurs access the [ring page](https://grafana.com/docs/tempo/latest/operations/consistent_hash_ring/) at `/compactor/ring`.
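
As a rough sketch of where the knobs named in the new runbook section live in the Tempo configuration: the nesting below follows the dotted parameter names in the text, while the endpoint URLs and all values are placeholders for illustration, not recommendations.

```yaml
querier:
  search:
    # serverless/external endpoints that backend search jobs can be sent to (placeholder URLs)
    external_endpoints:
      - https://tempo-search-1.example.com
      - https://tempo-search-2.example.com
    # hedge requests earlier (lower hedge_requests_at) and against more endpoints
    # (higher hedge_requests_up_to) for lower latency at the cost of more external load
    hedge_requests_at: 4s
    hedge_requests_up_to: 3
    # raise to keep more search jobs on the queriers themselves
    prefer_self: 10

query_frontend:
  # how aggressively the frontend can feed jobs to the queriers
  max_outstanding_per_tenant: 2000
  search:
    concurrent_jobs: 1000
```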
