Skip to content

Commit 2a2c42f

Browse files
authored
Merge branch 'MicrosoftDocs:main' into patch-1
2 parents fb14134 + 244e223 commit 2a2c42f

190 files changed

Lines changed: 4461 additions & 973 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.openpublishing.redirection.json

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13697,6 +13697,54 @@
1369713697
"source_path": "support/entra/entra-id/mfa/authorization-request-denied-graph-api.md",
1369813698
"redirect_url": "/troubleshoot/entra/entra-id/app-integration/troubleshoot-authorization-requestdenied-graph-api",
1369913699
"redirect_document_id": false
13700+
},
13701+
{
13702+
"source_path": "support/dynamics-365/commerce/point-of-sale/store-commerce-sign-in.md",
13703+
"redirect_url": "/troubleshoot/dynamics-365/commerce/welcome-commerce"
13704+
},
13705+
{
13706+
"source_path": "support/dynamics-365/commerce/payments/refund-miscalculated-partial-return.md",
13707+
"redirect_url": "/troubleshoot/dynamics-365/commerce/welcome-commerce"
13708+
},
13709+
{
13710+
"source_path": "support/dynamics-365/commerce/point-of-sale/troubleshoot-pos-aad-issues.md",
13711+
"redirect_url": "/troubleshoot/dynamics-365/commerce/welcome-commerce"
13712+
},
13713+
{
13714+
"source_path": "support/dynamics-365/commerce/financial-statements/z-report-issues.md",
13715+
"redirect_url": "/troubleshoot/dynamics-365/commerce/welcome-commerce"
13716+
},
13717+
{
13718+
"source_path": "support/dynamics-365/commerce/site-builder-content-management/security-group-site-builder.md",
13719+
"redirect_url": "/troubleshoot/dynamics-365/commerce/welcome-commerce"
13720+
},
13721+
{
13722+
"source_path": "support/dynamics-365/commerce/point-of-sale/store-commerce-setup-installation.md",
13723+
"redirect_url": "/previous-versions/troubleshoot/dynamics-365/commerce/point-of-sale/store-commerce-setup-installation"
13724+
},
13725+
{
13726+
"source_path": "support/dynamics-365/commerce/order-management/online-order-transaction-creation-failures.md",
13727+
"redirect_url": "/previous-versions/troubleshoot/dynamics-365/commerce/order-management/online-order-transaction-creation-failures"
13728+
},
13729+
{
13730+
"source_path": "support/dynamics-365/commerce/pricing-discounts-taxes/price-track-issues.md",
13731+
"redirect_url": "/previous-versions/troubleshoot/dynamics-365/commerce/pricing-discounts-taxes/price-track-issues"
13732+
},
13733+
{
13734+
"source_path": "support/dynamics-365/commerce/site-builder-content-management/products-categories-unavailable.md",
13735+
"redirect_url": "/previous-versions/troubleshoot/dynamics-365/commerce/site-builder-content-management/products-categories-unavailable"
13736+
},
13737+
{
13738+
"source_path": "support/dynamics-365/commerce/payments/credit-card-entry-form-error.md",
13739+
"redirect_url": "/previous-versions/troubleshoot/dynamics-365/commerce/payments/credit-card-entry-form-error"
13740+
},
13741+
{
13742+
"source_path": "support/dynamics-365/commerce/point-of-sale/store-commerce-performance.md",
13743+
"redirect_url": "/previous-versions/troubleshoot/dynamics-365/commerce/point-of-sale/store-commerce-performance"
13744+
},
13745+
{
13746+
"source_path": "support/dynamics-365/commerce/ecommerce-storefront/pickup-store-link-missing.md",
13747+
"redirect_url": "/previous-versions/troubleshoot/dynamics-365/commerce/ecommerce-storefront/pickup-store-link-missing"
1370013748
}
1370113749
]
1370213750
}

support/azure/.openpublishing.redirection.azure.json

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6320,6 +6320,40 @@
63206320
{
63216321
"source_path": "azure-kubernetes/error-codes/zonalallocation-allocatonfailed-error.md",
63226322
"redirect_url": "/troubleshoot/azure/azure-kubernetes/error-codes/zonalallocation-allocationfailed-error"
6323+
},
6324+
{
6325+
6326+
"source_path": "kubernetes-fleet/troubleshoot-clusterresourceplacement-api-issues.md",
6327+
"redirect_url": "/troubleshoot/azure/kubernetes-fleet/cluster-resource-placement/troubleshoot-clusterresourceplacement-api-issues"
6328+
},
6329+
{
6330+
"source_path": "kubernetes-fleet/crp-clusterresourceplacementscheduled-false.md",
6331+
"redirect_url": "/troubleshoot/azure/kubernetes-fleet/cluster-resource-placement/crp-clusterresourceplacementscheduled-false"
6332+
},
6333+
{
6334+
"source_path": "kubernetes-fleet/crp-clusterresourceplacementrolloutstarted-false.md",
6335+
"redirect_url": "/troubleshoot/azure/kubernetes-fleet/cluster-resource-placement/crp-clusterresourceplacementrolloutstarted-false"
6336+
},
6337+
{
6338+
"source_path": "kubernetes-fleet/crp-clusterresourceplacementoverridden-false.md",
6339+
"redirect_url": "/troubleshoot/azure/kubernetes-fleet/cluster-resource-placement/crp-clusterresourceplacementoverridden-false"
6340+
},
6341+
{
6342+
"source_path": "kubernetes-fleet/crp-clusterresourceplacementworksynchronized-false.md",
6343+
"redirect_url": "/troubleshoot/azure/kubernetes-fleet/cluster-resource-placement/crp-clusterresourceplacementworksynchronized-false"
6344+
},
6345+
{
6346+
"source_path": "kubernetes-fleet/crp-clusterresourceplacementapplied-false.md",
6347+
"redirect_url": "/troubleshoot/azure/kubernetes-fleet/cluster-resource-placement/crp-clusterresourceplacementapplied-false"
6348+
},
6349+
{
6350+
"source_path": "kubernetes-fleet/crp-clusterresourceplacementavailable-false.md",
6351+
"redirect_url": "/troubleshoot/azure/kubernetes-fleet/cluster-resource-placement/crp-clusterresourceplacementavailable-false"
6352+
},
6353+
{
6354+
"source_path": "hpc/batch/error-accountencryptionkeyunavailable.md",
6355+
"redirect_url": "/troubleshoot/azure/hpc/batch/welcome-hpc-batch"
6356+
63236357
}
63246358
]
63256359
}

support/azure/azure-kubernetes/availability-performance/identify-high-cpu-consuming-containers-aks.md

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
---
2-
title: Identify CPU saturation in AKS clusters
2+
title: Identify high CPU utilization in AKS clusters
33
description: Troubleshoot high CPU that the node and containers consume in an AKS cluster.
44
ms.date: 08/30/2024
55
ms.reviewer: chiragpa, v-weizhu
@@ -8,6 +8,9 @@ ms.custom: sap:Node/node pool availability and performance
88
---
99
# Troubleshoot high CPU usage in AKS clusters
1010

11+
> [!NOTE]
12+
> This article discusses high CPU utilization. In many situations, CPU Pressure Stall Information (PSI) metrics provide a more accurate indication of CPU pressure than utilization alone. For more information, see [Troubleshoot CPU pressure in AKS clusters using PSI metrics](troubleshoot-node-cpu-pressure-psi.md).
13+
1114
High CPU usage is a symptom of one or more applications or processes that require so much CPU time that the performance or usability of the machine is impacted. High CPU usage can occur in many ways, but it's mostly caused by user configuration.
1215

1316
When a node in an [Azure Kubernetes Service (AKS)](/azure/aks/intro-kubernetes) cluster experiences high CPU usage, the applications running on it can experience degradation in performance and reliability. Applications or processes also become unstable, which may lead to issues beyond slow responses.

support/azure/azure-kubernetes/availability-performance/identify-memory-saturation-aks.md

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@ ms.date: 08/30/2024
55
editor: v-jsitser
66
ms.reviewer: chiragpa, aritraghosh, v-leedennis
77
ms.service: azure-kubernetes-service
8-
#Customer intent: As an Azure Kubernetes user, I want to understand how to identify memory saturation in my Azure Kubernetes Service (AKS) clusters so that I don't experience service interruption or other memory saturation issues.
98
ms.custom: sap:Node/node pool availability and performance
109
---
1110
# Troubleshoot memory saturation in AKS clusters
Loading
96.5 KB
Loading
Lines changed: 154 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,154 @@
1+
---
2+
title: Troubleshoot CPU Pressure in AKS Clusters Using PSI Metrics
3+
description: Provides troubleshooting guidance on CPU pressure using PSI metrics in an AKS cluster.
4+
ms.date: 05/21/2025
5+
ms.reviewer: aritraghosh, dafell, alvinli, v-weizhu
6+
ms.service: azure-kubernetes-service
7+
ms.custom: sap:Node/node pool availability and performance
8+
---
9+
# Troubleshoot CPU pressure in AKS clusters using PSI metrics
10+
11+
CPU pressure is a more accurate indicator of resource contention than traditional CPU utilization metrics. While high CPU usage shows resource consumption, it doesn't necessarily indicate performance problems. In an Azure Kubernetes Service (AKS) cluster, understanding CPU pressure through Pressure Stall Information (PSI) metrics helps identify true resource contention issues.
12+
13+
When a node in an AKS cluster experiences CPU pressure, applications might suffer from poor performance even when CPU utilization appears moderate. PSI metrics provide insight into actual resource contention by measuring task delays rather than just resource consumption.
14+
15+
This article helps you monitor CPU pressure using PSI metrics and provides best practices to resolve resource contention issues.
16+
17+
## Symptoms
18+
19+
The following table outlines the common symptoms of CPU pressure:
20+
21+
|Symptom | Description |
22+
|---|---|
23+
|Increased application latency|Services respond slower even when CPU utilization appears moderate.|
24+
|Throttled containers|Containers experience delays in processing despite having CPU resources available on the node.|
25+
|Degraded performance|Applications experience unpredictable performance variations that don't correlate with CPU usage percentages.|
26+
27+
## Troubleshooting checklist
28+
29+
To identify and resolve CPU pressure issues, follow these steps:
30+
31+
### Step 1: Enable and monitor PSI metrics
32+
33+
Use one of the following methods to access PSI metrics:
34+
35+
- In a web browser, use Azure Monitoring Managed Prometheus or another monitoring solution to query PSI metrics.
36+
- In a console, use the Kubernetes command-line tool (`kubectl`).
37+
38+
### [Browser](#tab/browser)
39+
40+
Azure Monitoring Managed Prometheus provides a way to monitor PSI metrics:
41+
42+
1. Enable Azure Monitoring Managed Prometheus for your AKS cluster by following the instructions in [Enable Prometheus and Grafana](/azure/azure-monitor/containers/kubernetes-monitoring-enable#enable-prometheus-and-grafana).
43+
44+
To enable customized scrape metrics for Prometheus, see [Scrape configs](/azure/azure-monitor/containers/prometheus-metrics-scrape-configuration#scrape-configs). We recommend setting `minimum ingestion profile` to `false` and `node-exporter` to `true`.
45+
46+
2. Navigate to the Azure Monitor workspace associated with the AKS cluster from the [Azure portal](https://portal.azure.com).
47+
48+
:::image type="content" source="media/troubleshoot-node-cpu-pressure-psi/configure-azure-monitor-for-containers.png" alt-text="Screenshot that shows how to navigate to the Azure Monitor workspace." lightbox="media/troubleshoot-node-cpu-pressure-psi/configure-azure-monitor-for-containers.png":::
49+
50+
3. Under **Monitoring**, select **Metrics**.
51+
52+
4. Select **Prometheus metrics** as the data source.
53+
54+
> [!NOTE]
55+
> To use the metrics, you need to enable them in Azure Monitoring Managed Prometheus. These metrics are exposed by Node Exporter or cAdvisor.
56+
57+
5. Query specific PSI metrics in Prometheus explorer:
58+
59+
- For node-level CPU pressure, use the `node_pressure_cpu_waiting_seconds_total` Prometheus Query Language (PromQL) query.
60+
61+
:::image type="content" source="media/troubleshoot-node-cpu-pressure-psi/node-level-cpu-pressure.png" alt-text="Screenshot that shows how to query node-level CPU pressure." lightbox="media/troubleshoot-node-cpu-pressure-psi/node-level-cpu-pressure.png":::
62+
63+
- For pod-level CPU pressure, use the `container_cpu_cfs_throttled_seconds_total` PromQL query.
64+
65+
6. Calculate the PSI-some percentage (percentage of time at least one task is stalled on CPU):
66+
67+
`rate(node_pressure_cpu_waiting_seconds_total[5m]) * 100`
68+
69+
> [!NOTE]
70+
> Some of the container-level metrics, such as `container_pressure_cpu_waiting_seconds_total` and `container_pressure_cpu_stalled_seconds_total`, aren't available in AKS because they're part of the Kubelet PSI feature gate, which is in the alpha state. AKS will begin supporting the feature when it reaches the beta stage.
71+
72+
### [Command Line](#tab/command-line)
73+
74+
Access PSI metrics safely using kubectl without requiring Secure Shell (SSH) access:
75+
76+
1. Use the Kubernetes proxy and node metrics:
77+
78+
```bash
79+
# Start the kubernetes proxy in a separate terminal
80+
kubectl proxy
81+
82+
# Access node metrics API
83+
kubectl get --raw /apis/metrics.k8s.io/v1beta1/nodes
84+
```
85+
86+
2. For more detailed PSI metrics, use the `kubectl debug` feature to create a temporary debug pod:
87+
88+
```bash
89+
# Create a debug pod that mounts the host filesystem
90+
kubectl debug node/<node_name> -it --image=busybox
91+
92+
# Once inside the debug pod, check PSI metrics
93+
cat /host/proc/pressure/cpu
94+
```
95+
96+
Here's an example command output:
97+
98+
```output
99+
some avg10=0.00 avg60=0.00 avg300=0.00 total=0
100+
full avg10=0.00 avg60=0.00 avg300=0.00 total=0
101+
```
102+
103+
- The `some` line indicates the percentage of time at least one task is stalled on CPU.
104+
- The `full` line indicates the percentage of time all tasks are stalled on CPU.
105+
106+
---
107+
108+
### Step 2: Review best practices to prevent CPU pressure
109+
110+
Review the following table to learn how to implement best practices for avoiding CPU pressure:
111+
112+
| Best practice | Description |
113+
|---|---|
114+
|Focus on PSI metrics instead of utilization|Use PSI metrics as your primary indicator of resource contention rather than CPU utilization percentages. For more information, see [PSI - Pressure Stall Information](https://docs.kernel.org/accounting/psi.html).|
115+
|Identify pods utilizing the most CPU|Isolate the pods that are utilizing the most CPU and identify solutions to reduce pressure. For more information, see [Troubleshoot high CPU usage in AKS clusters](./identify-high-cpu-consuming-containers-aks.md).|
116+
|Minimize CPU limits|Consider removing CPU limits and rely on [Linux's Completely Fair Scheduler](https://docs.kernel.org/scheduler/sched-design-CFS.html) with CPU shares based on requests. For more information, see [Resource Management for Pods and Containers](https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/).|
117+
|Use appropriate Quality of Service (QoS) classes|Set the right QoS class for each pod based on its importance and contention sensitivity. For more information, see [Configure Quality of Service for Pods](https://kubernetes.io/docs/tasks/configure-pod-container/quality-service-pod/).|
118+
|Optimize pod placement|Use pod anti-affinity rules to avoid placing CPU-intensive workloads on the same nodes. For more information, see [Assigning Pods to Nodes](https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/).|
119+
|Monitor for brief pressure spikes|Short pressure spikes can indicate issues even when average utilization appears acceptable. For more information, see [Resource metrics pipeline](https://kubernetes.io/docs/tasks/debug/debug-cluster/resource-metrics-pipeline/).|
120+
121+
## Key PSI metrics to monitor
122+
123+
> [!NOTE]
124+
> If a node's CPU usage is moderate but the containers on the node experience CFS throttling, increase the resource limits, or remove them and rely on [Linux's Completely Fair Scheduler (CFS)](https://docs.kernel.org/scheduler/sched-design-CFS.html) to share CPU time based on requests.
125+
126+
### Node-level PSI metrics
127+
128+
- `node_pressure_cpu_waiting_seconds_total`: Cumulative time tasks wait for CPU.
129+
- `node_cpu_seconds_total`: Traditional CPU utilization for comparison.
130+
131+
### Container-level PSI indicators
132+
133+
- `container_cpu_cfs_throttled_periods_total`: The number of periods a container is throttled.
134+
- `container_cpu_cfs_throttled_seconds_total`: Total time a container is throttled.
135+
- Throttling percentage: `rate(container_cpu_cfs_throttled_periods_total[5m]) / rate(container_cpu_cfs_periods_total[5m]) * 100`
136+
137+
## Why use PSI metrics?
138+
139+
AKS uses PSI metrics as an indicator for CPU pressure instead of load average for several reasons:
140+
141+
- In oversized and multi-core nodes, load average often underreports CPU saturation.
142+
- On chattier and containerized nodes, load average can over-signal, leading to alert fatigue.
143+
- Since load average doesn't have per-cgroup visibility, noisy pods can hide behind a low system average.
144+
145+
## References
146+
147+
- [Linux PSI documentation](https://docs.kernel.org/accounting/psi.html)
148+
- [Kubernetes resource management](https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/)
149+
- [AKS performance best practices](/azure/aks/concepts-clusters-workloads)
150+
- [Enable Prometheus and Grafana](/azure/azure-monitor/containers/kubernetes-monitoring-enable#enable-prometheus-and-grafana)
151+
- [Quality of Service in Kubernetes](https://kubernetes.io/docs/tasks/configure-pod-container/quality-service-pod/)
152+
- [Linux Completely Fair Scheduler](https://docs.kernel.org/scheduler/sched-design-CFS.html)
153+
154+
[!INCLUDE [Azure Help Support](../../../includes/azure-help-support.md)]

support/azure/azure-kubernetes/create-upgrade-delete/error-code-customprivatednszonemissingpermissionerror.md

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
---
22
title: CustomPrivateDNSZoneMissingPermissionError error code
33
description: Learn how to fix the CustomPrivateDNSZoneMissingPermissionError error that occurs when you try to create or update an Azure Kubernetes Service (AKS) cluster.
4-
ms.date: 11/28/2023
4+
ms.date: 05/27/2025
55
author: jotavar
66
ms.author: jotavar
77
editor: v-jsitser
@@ -28,7 +28,11 @@ An AKS cluster create or update operation fails and returns the following error
2828
2929
## Cause
3030

31-
Before AKS runs a cluster create or update operation for a private cluster that uses a [custom private DNS zone](/azure/aks/private-clusters#configure-a-private-dns-zone), it checks whether the cluster's managed identity or service principal has the required permissions to control the private DNS zone. If AKS doesn't find the necessary permissions, it blocks the operation so that the cluster doesn't enter a failed state.
31+
Before AKS runs a cluster create or update operation for a private cluster that uses a [custom private DNS zone](/azure/aks/private-clusters#configure-a-private-dns-zone), it checks whether the cluster's managed identity or service principal has the required permissions to control the private DNS zone. If AKS can't find the necessary permissions in cases like the following ones, it blocks the operation so that the cluster doesn't enter a failed state:
32+
33+
- The managed identity or service principal has been deleted.
34+
- The managed identity or service principal has been re-created with the same name.
35+
- An incorrect managed identity is passed.
3236

3337
## Solution
3438

0 commit comments

Comments
 (0)