Commit 5b7e7c1

cleanup newlines
1 parent a760257 commit 5b7e7c1

3 files changed: 244 additions and 8 deletions

operations/tempo-mixin-compiled/dashboards/tempo-reads.json

Lines changed: 215 additions & 5 deletions
@@ -704,7 +704,7 @@
         "steppedLine": false,
         "targets": [
           {
-            "expr": "sum by (status) (\n label_replace(label_replace(rate(tempo_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/ingester\", route=~\"/tempopb.Querier/.*\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n",
+            "expr": "sum by (status) (\n label_replace(label_replace(rate(tempo_querier_external_endpoint_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/querier\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n",
             "format": "time_series",
             "interval": "1m",
             "legendFormat": "{{status}}",
@@ -782,6 +782,216 @@
         "renderer": "flot",
         "seriesOverrides": [
 
+        ],
+        "spaceLength": 10,
+        "span": 6,
+        "stack": false,
+        "steppedLine": false,
+        "targets": [
+          {
+            "expr": "histogram_quantile(0.99, sum(rate(tempo_querier_external_endpoint_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/querier\"}[$__rate_interval])) by (le,endpoint)) * 1e3",
+            "format": "time_series",
+            "interval": "1m",
+            "intervalFactor": 2,
+            "legendFormat": "{{route}} 99th",
+            "refId": "A",
+            "step": 10
+          },
+          {
+            "expr": "histogram_quantile(0.50, sum(rate(tempo_querier_external_endpoint_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/querier\"}[$__rate_interval])) by (le,endpoint)) * 1e3",
+            "format": "time_series",
+            "interval": "1m",
+            "intervalFactor": 2,
+            "legendFormat": "{{route}} 50th",
+            "refId": "B",
+            "step": 10
+          },
+          {
+            "expr": "sum(rate(tempo_querier_external_endpoint_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/querier\"}[$__rate_interval])) by (endpoint) * 1e3 / sum(rate(tempo_querier_external_endpoint_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/querier\"}[$__rate_interval])) by (endpoint)",
+            "format": "time_series",
+            "interval": "1m",
+            "intervalFactor": 2,
+            "legendFormat": "{{route}} Average",
+            "refId": "C",
+            "step": 10
+          }
+        ],
+        "thresholds": [
+
+        ],
+        "timeFrom": null,
+        "timeShift": null,
+        "title": "Latency",
+        "tooltip": {
+          "shared": true,
+          "sort": 2,
+          "value_type": "individual"
+        },
+        "type": "graph",
+        "xaxis": {
+          "buckets": null,
+          "mode": "time",
+          "name": null,
+          "show": true,
+          "values": [
+
+          ]
+        },
+        "yaxes": [
+          {
+            "format": "ms",
+            "label": null,
+            "logBase": 1,
+            "max": null,
+            "min": 0,
+            "show": true
+          },
+          {
+            "format": "short",
+            "label": null,
+            "logBase": 1,
+            "max": null,
+            "min": null,
+            "show": false
+          }
+        ]
+      }
+    ],
+    "repeat": null,
+    "repeatIteration": null,
+    "repeatRowId": null,
+    "showTitle": true,
+    "title": "Querier External Endpoint",
+    "titleSize": "h6"
+  },
+  {
+    "collapse": false,
+    "height": "250px",
+    "panels": [
+      {
+        "aliasColors": {
+          "1xx": "#EAB839",
+          "2xx": "#7EB26D",
+          "3xx": "#6ED0E0",
+          "4xx": "#EF843C",
+          "5xx": "#E24D42",
+          "OK": "#7EB26D",
+          "cancel": "#A9A9A9",
+          "error": "#E24D42",
+          "success": "#7EB26D"
+        },
+        "bars": false,
+        "dashLength": 10,
+        "dashes": false,
+        "datasource": "$datasource",
+        "fill": 10,
+        "id": 9,
+        "legend": {
+          "avg": false,
+          "current": false,
+          "max": false,
+          "min": false,
+          "show": true,
+          "total": false,
+          "values": false
+        },
+        "lines": true,
+        "linewidth": 0,
+        "links": [
+
+        ],
+        "nullPointMode": "null as zero",
+        "percentage": false,
+        "pointradius": 5,
+        "points": false,
+        "renderer": "flot",
+        "seriesOverrides": [
+
+        ],
+        "spaceLength": 10,
+        "span": 6,
+        "stack": true,
+        "steppedLine": false,
+        "targets": [
+          {
+            "expr": "sum by (status) (\n label_replace(label_replace(rate(tempo_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/ingester\", route=~\"/tempopb.Querier/.*\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n",
+            "format": "time_series",
+            "interval": "1m",
+            "legendFormat": "{{status}}",
+            "refId": "A"
+          }
+        ],
+        "thresholds": [
+
+        ],
+        "timeFrom": null,
+        "timeShift": null,
+        "title": "QPS",
+        "tooltip": {
+          "shared": true,
+          "sort": 2,
+          "value_type": "individual"
+        },
+        "type": "graph",
+        "xaxis": {
+          "buckets": null,
+          "mode": "time",
+          "name": null,
+          "show": true,
+          "values": [
+
+          ]
+        },
+        "yaxes": [
+          {
+            "format": "short",
+            "label": null,
+            "logBase": 1,
+            "max": null,
+            "min": 0,
+            "show": true
+          },
+          {
+            "format": "short",
+            "label": null,
+            "logBase": 1,
+            "max": null,
+            "min": null,
+            "show": false
+          }
+        ]
+      },
+      {
+        "aliasColors": {
+
+        },
+        "bars": false,
+        "dashLength": 10,
+        "dashes": false,
+        "datasource": "$datasource",
+        "fill": 1,
+        "id": 10,
+        "legend": {
+          "avg": false,
+          "current": false,
+          "max": false,
+          "min": false,
+          "show": true,
+          "total": false,
+          "values": false
+        },
+        "lines": true,
+        "linewidth": 1,
+        "links": [
+
+        ],
+        "nullPointMode": "null as zero",
+        "percentage": false,
+        "pointradius": 5,
+        "points": false,
+        "renderer": "flot",
+        "seriesOverrides": [
+
         ],
         "spaceLength": 10,
         "span": 6,
@@ -885,7 +1095,7 @@
         "dashes": false,
         "datasource": "$datasource",
         "fill": 10,
-        "id": 9,
+        "id": 11,
         "legend": {
           "avg": false,
           "current": false,
@@ -970,7 +1180,7 @@
         "dashes": false,
         "datasource": "$datasource",
         "fill": 1,
-        "id": 10,
+        "id": 12,
         "legend": {
           "avg": false,
           "current": false,
@@ -1095,7 +1305,7 @@
         "dashes": false,
         "datasource": "$datasource",
         "fill": 10,
-        "id": 11,
+        "id": 13,
         "legend": {
           "avg": false,
           "current": false,
@@ -1180,7 +1390,7 @@
         "dashes": false,
        "datasource": "$datasource",
         "fill": 1,
-        "id": 12,
+        "id": 14,
         "legend": {
           "avg": false,
           "current": false,

operations/tempo-mixin/dashboards/tempo-reads.libsonnet

Lines changed: 11 additions & 0 deletions
@@ -39,6 +39,17 @@ dashboard_utils {
         $.latencyPanel('tempo_request_duration_seconds', '{%s,route=~"querier_%sapi_.*"}' % [$.jobMatcher($._config.jobs.querier), $._config.http_api_prefix], additional_grouping='route')
       )
     )
+    .addRow(
+      g.row('Querier External Endpoint')
+      .addPanel(
+        $.panel('QPS') +
+        $.qpsPanel('tempo_querier_external_endpoint_duration_seconds_count{%s}' % [$.jobMatcher($._config.jobs.querier)])
+      )
+      .addPanel(
+        $.panel('Latency') +
+        $.latencyPanel('tempo_querier_external_endpoint_duration_seconds', '{%s}' % [$.jobMatcher($._config.jobs.querier)], additional_grouping='endpoint')
+      )
+    )
     .addRow(
       g.row('Ingester')
       .addPanel(
operations/tempo-mixin/runbook.md

Lines changed: 18 additions & 3 deletions
@@ -45,9 +45,6 @@ Another way to increase parallelism is by increasing the size of the worker pool
 A theoretically ideal value for this config to avoid _any_ queueing would be (Size of blocklist / Max Concurrent Queries).
 But also factor in the resources provided to the querier.
 
-Our [documentation](https://grafana.com/docs/tempo/latest/operations/backend_search/#query-frontend)
-includes [a solid guide](https://grafana.com/docs/tempo/latest/operations/backend_search/#guidelines-on-key-configuration-parameters) on the various parameters with suggestions.
-
 ### Trace Lookup Failures
 
 If trace lookups fail with the error: `error querying store in Querier.FindTraceByID: queue doesn't have room for <xyz> jobs`, this
@@ -73,6 +70,24 @@ Consider the following resolutions:
 - Increase the queue_depth size to do more work per querier
 - Adjust compaction settings to reduce the number of blocks
 
+### Serverless/External Endpoints
+
+If request latency issues are due to backend searches that use serverless/external endpoints, there may be additional configuration
+options that will help decrease latency. Before you get started, know that serverless functionality only impacts the `/api/search` endpoint
+when `start` and `end` parameters are passed and `external_endpoints` are configured on the querier. One way to determine whether external
+endpoints are being hit is to check the Reads dashboard and look for the "Querier External Endpoint" row.
+
+Tuning serverless search can be difficult. Our [public documentation](https://grafana.com/docs/tempo/latest/operations/backend_search/#query-frontend)
+includes a solid guide on the various parameters with suggestions. The suggestions below augment that guide:
+
+- Consider provisioning more serverless functions and adding them to the `querier.search.external_endpoints` array. This will increase your
+  baseline latency and your total throughput.
+- Decreasing `querier.search.hedge_requests_at` and increasing `querier.search.hedge_requests_up_to` will put more pressure on the serverless
+  endpoints but will result in lower latency.
+- Increasing `querier.search.prefer_self` and scaling up the queriers will cause more work to be performed by the queriers, which will lower latencies.
+- Increasing `query_frontend.max_outstanding_per_tenant` and `query_frontend.search.concurrent_jobs` will increase the rate at which the
+  query_frontend feeds jobs to the queriers and can decrease latency.
+
 ## TempoCompactorUnhealthy
 
 If this occurs access the [ring page](https://grafana.com/docs/tempo/latest/operations/consistent_hash_ring/) at `/compactor/ring`.
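
As a rough sketch of where the knobs named in the new runbook section live in the Tempo configuration: the nesting below follows the dotted parameter names in the text, while the endpoint URLs and all values are placeholders for illustration, not recommendations.

```yaml
querier:
  search:
    # serverless/external endpoints that backend search jobs can be sent to (placeholder URLs)
    external_endpoints:
      - https://tempo-search-1.example.com
      - https://tempo-search-2.example.com
    # hedge requests earlier (lower hedge_requests_at) and against more endpoints
    # (higher hedge_requests_up_to) for lower latency at the cost of more external load
    hedge_requests_at: 4s
    hedge_requests_up_to: 3
    # raise to keep more search jobs on the queriers themselves
    prefer_self: 10

query_frontend:
  # how aggressively the frontend can feed jobs to the queriers
  max_outstanding_per_tenant: 2000
  search:
    concurrent_jobs: 1000
```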
