排查适用于 SQL Server 的 Azure 扩展的问题

适用范围:SQL Server

请使用 Azure Resource Graph 来识别在启用 Azure Arc 的服务器上,SQL Server 的 Azure 扩展的状态。 本文演示标识不正常的扩展的查询。

提示

如果尚不熟悉,请先了解 Azure Resource Graph:

识别不正常的扩展

可以在 Azure 门户中创建仪表板,以显示 SQL Server 的所有已部署 Azure 扩展的运行状况。

提示

使用此文件从 sql-server-samples GitHub 存储库创建自己的仪表板:已启用 Arc 的 SQL Server Health.json

查询运行不正常的扩展

此查询返回已安装扩展但运行不正常的服务器上的 SQL Server 实例。

resources
| where type == "microsoft.hybridcompute/machines/extensions" 
| where properties.type in ("WindowsAgent.SqlServer", "LinuxAgent.SqlServer") 
| extend targetMachineName = tolower(tostring(split(id, '/')[8])) // Extract the machine name from the extension's id
| join kind=leftouter (
    resources
    | where type == "microsoft.hybridcompute/machines"
    | project machineId = id, MachineName = name, subscriptionId, LowerMachineName = tolower(name), resourceGroup , MachineStatus= properties.status , MachineProvisioningStatus= properties.provisioningState, MachineErrors = properties.errorDetails //Project relevant machine health information.
) on $left.targetMachineName == $right.LowerMachineName and $left.resourceGroup == $right.resourceGroup and $left.subscriptionId == $right.subscriptionId // Join Based on MachineName in the id and the machine's name, the resource group, and the subscription. This join allows us to present the data of the machine as well as the extension in the final output.
| extend statusExpirationLengthRange = 3d // Change this value to change the acceptable range for the last time an extension should have reported its status.
| extend startDate = startofday(now() - statusExpirationLengthRange), endDate = startofday(now()) // Get the start and end position for the given range.
| extend extractedDateString = extract("timestampUTC : (\\d{4}\\W\\d{2}\\W\\d{2})", 1, tostring(properties.instanceView.status.message)) // Extracting the date string for the LastUploadTimestamp. Is empty if none is found.
| extend extractedDateStringYear = split(extractedDateString, '/')[0], extractedDateStringMonth = split(extractedDateString, '/')[1], extractedDateStringDay = split(extractedDateString, '/')[2] // Identifying each of the parts of the date that was extracted from the message.
| extend extractedDate = todatetime(strcat(extractedDateStringYear,"-",extractedDateStringMonth,"-",extractedDateStringDay,"T00:00:00Z")) // Converting to a datetime object and rewriting string into ISO format because todatetime() does not work using the previous format.
| extend isNotInDateRange = not(extractedDate >= startDate and extractedDate <= endDate) // Created bool which is true if the date we extracted from the message is not within the specified range. This bool will also be true if the date was not found in the message.
| where properties.instanceView.status.message !contains "SQL Server Extension Agent: Healthy" // Begin searching for unhealthy extensions using the following 1. Does extension report being healthy. 2. Is last upload within the given range. 3. Is the upload status in an OK state. 4. Is provisioning state not in a succeeded state.
    or isNotInDateRange
    or properties.instanceView.status.message !contains "uploadStatus : OK"
    or properties.provisioningState != "Succeeded"
    or MachineStatus != "Connected"
| extend FailureReasons = strcat( // Makes a String to list all the reason that this resource got flagged for
        iif(MachineStatus != "Connected",strcat("- Machine's status is ", MachineStatus," -"),"") ,
        iif(MachineErrors != "[]","- Machine reports errors -", ""),
        iif(properties.instanceView.status.message !contains "SQL Server Extension Agent: Healthy","- Extension reported unhealthy -",""), 
        iif(isNotInDateRange,"- Last upload outside acceptable range -",""),
        iif(properties.instanceView.status.message !contains "uploadStatus : OK","- Upload status is not reported OK -",""), 
        iif(properties.provisioningState != "Succeeded",strcat("- Extension provisiong state is ", properties.provisioningState," -"),"") 
    )
| extend RecommendedAction = //Attempt to Identify RootCause based on information gathered, and point customer to what they should investigate first.
    iif(MachineStatus == "Disconnected", "Machine is disconnected. Please reconnect the machine.",
        iif(MachineStatus == "Expired", "Machine cert is expired. Go to the machine on the Azure portal for more information on how to resolve this issue.",
            iif(MachineStatus != "Connected", strcat("Machine status is ", MachineStatus,". Investigate and resolve this issue."),
                iif(MachineProvisioningStatus != "Succeeded", strcat("Machine provisioning status is ", MachineProvisioningStatus, ". Investigate and resolve machine provisioning status"),
                    iff(MachineErrors != "[]", "Machine is reporting errors. Investigate and resolve machine errors",
                        iif(properties.provisioningState != "Succeeded", strcat("Extension provisioning status is ", properties.provisioningState,". Investigate and resolve extension provisioning state."),
                            iff(properties.instanceView.status.message !contains "SQL Server Extension Agent:" and properties.instanceView.status.message contains "SQL Server Extension Agent Deployer", "SQL Server extension employer ran. However, SQL Server extension seems to not be running. Verify that the extension is currently running.",
                                iff(properties.instanceView.status.message !contains "uploadStatus : OK" or isNotInDateRange or properties.instanceView.status.message !contains "SQL Server Extension Agent: Healthy", "Extension reported as unhealthy. View FailureReasons and LastExtensionStatusMessage for more information as to the cause of the failure.",
                                    "Unable to recommend actions. Please view FailureReasons."
                                )
                            )
                        )
                    )
                )
            )
        )
    )
| project ID = id, MachineName, ResourceGroup = resourceGroup, SubscriptionID = subscriptionId, Location = location, RecommendedAction, FailureReasons, LicenseType = properties.settings.LicenseType, 
    LastReportedExtensionHealth = iif(properties.instanceView.status.message !contains "SQL Server Extension Agent: Healthy", "Unhealthy", "Healthy"),
    LastExtensionUploadTimestamp = iif(indexof(properties.instanceView.status.message, "timestampUTC : ") > 0,
        substring(properties.instanceView.status.message, indexof(properties.instanceView.status.message, "timestampUTC : ") + 15, 10),
        "no timestamp"),
    LastExtensionUploadStatus = iif(indexof(properties.instanceView.status.message, "uploadStatus : OK") > 0, "OK", "Unhealthy"),
    ExtensionProvisioningState = properties.provisioningState,
    MachineStatus, MachineErrors, MachineProvisioningStatus,MachineId = machineId,
    LastExtensionStatusMessage = properties.instanceView.status.message

如需识别可能的问题,请查看 建议措施失败原因 列中的值。 RecommendedAction 列提供了解决问题可能需要采取的首要步骤或需要首先检查的线索。 FailureReasons 列会列出资源被视为运行不正常的原因。 最后,检查 LastExtensionStatusMessage 以查看代理最后报告的消息

疑难解答指南

建议的操作 操作详情
计算机证书已过期。

若要详细了解如何解决此问题,请转到 Azure 门户上的计算机。
已启用 Arc 的计算机必须重新载入 Arc,因为用于向 Azure 进行身份验证的证书已过期。 在 Azure 门户中,Arc 计算机状态为“已过期”。 按照此处的文档卸载代理,然后在此处重新加入。 如果重新载入,则无需在门户中删除已启用 Arc 的 SQL Server 资源。 只要启用自动配置(默认),就会再次自动安装 SQL 扩展。
计算机已断开连接。

重新连接计算机。
Arc 计算机位于 state = Disconnected 中。 此状态可能有多种原因:

已连接 Arc 的计算机代理已停止、禁用或不断崩溃



代理与 Azure 之间的连接被阻止。

检查已连接 Arc 的计算机服务/守护程序的状态,确保启用并运行

检查连接

使用详细日志对代理进行故障排除
扩展报告为运行不正常。

有关失败原因的详细信息,请查看 FailureReasons 和 LastExtensionStatusMessage。

上次上传超出可接受的范围(在过去三天内)。
检查 LastExtensionUploadTimestamp 列。 如果 没有时间戳,则它永远不会向 Azure 报告清单或使用情况数据。 排查从 SQL 扩展到 Azure 的连接问题

如果最后一个上传超出可接受的范围(在过去三天内),其他所有内容都看起来正常,例如 LastExtensionUploadStatusExtensionProvisioningStateMachineStatus,则 Arc SQL 扩展服务/守护程序可能会停止。 找出它为何停止并再次启动它。 检查 LastExtensionStatusMessage 是否有有关该问题的任何其他线索。
扩展配置状态 失败

调查并解决扩展预配状态。
SQL 扩展的初始安装或更新失败。 检查部署器和扩展日志

检查 LastExtensionStatusMessage中的值。
上传状态未报告正常 检查仪表板中的 LastExtensionMessage 列,并查看 uploadStatus 值和 uploadMessage 值(如果存在,具体取决于版本)。

uploadStatus 值通常是 HTTP 错误代码。 查看排查错误代码

uploadMessage 可能有更具体的信息。 常规 Arc SQL 扩展连接故障排除
扩展预配状态为“正在更新”



扩展预配状态为“正在创建”



扩展预配状态 失败



扩展预配状态为“正在删除”
如果给定的扩展保持其中一个状态超过 30 分钟,则预配可能存在问题。 卸载该扩展,然后使用 CLI 或门户重新安装它。 如果问题仍然存在,检查部署器和扩展日志

如果删除失败,请尝试卸载代理,并根据需要在门户中删除 Arc 计算机资源,然后重新部署。

按照此处的文档卸载代理,然后在此处重新加入。

识别不正常的扩展 (PowerShell)

此示例在 PowerShell 中运行。 该示例将返回与上一个查询相同的结果,但通过 PowerShell 脚本返回。

# PowerShell script to execute an Azure Resource Graph query using Azure CLI
# where the extension status is unhealthy or the extension last upload time isn't in this month or the previous month.

# Requires the Az.ResourceGraph PowerShell module

# Login to Azure if needed
#az login

# Define the Azure Resource Graph query
$query = @"
resources
| where type == "microsoft.hybridcompute/machines/extensions" 
| where properties.type in ("WindowsAgent.SqlServer", "LinuxAgent.SqlServer") 
| extend targetMachineName = tolower(tostring(split(id, '/')[8])) // Extract the machine name from the extension's id
| join kind=leftouter (
    resources
    | where type == "microsoft.hybridcompute/machines"
    | project machineId = id, MachineName = name, subscriptionId, LowerMachineName = tolower(name), resourceGroup , MachineStatus= properties.status , MachineProvisioningStatus= properties.provisioningState, MachineErrors = properties.errorDetails //Project relevant machine health information.
) on $left.targetMachineName == $right.LowerMachineName and $left.resourceGroup == $right.resourceGroup and $left.subscriptionId == $right.subscriptionId // Join Based on MachineName in the id and the machine's name, the resource group, and the subscription. This join allows us to present the data of the machine as well as the extension in the final output.
| extend statusExpirationLengthRange = 3d // Change this value to change the acceptable range for the last time an extension should have reported its status.
| extend startDate = startofday(now() - statusExpirationLengthRange), endDate = startofday(now()) // Get the start and end position for the given range.
| extend extractedDateString = extract("timestampUTC : (\\d{4}\\W\\d{2}\\W\\d{2})", 1, tostring(properties.instanceView.status.message)) // Extracting the date string for the LastUploadTimestamp. Is empty if none is found.
| extend extractedDateStringYear = split(extractedDateString, '/')[0], extractedDateStringMonth = split(extractedDateString, '/')[1], extractedDateStringDay = split(extractedDateString, '/')[2] // Identifying each of the parts of the date that was extracted from the message.
| extend extractedDate = todatetime(strcat(extractedDateStringYear,"-",extractedDateStringMonth,"-",extractedDateStringDay,"T00:00:00Z")) // Converting to a datetime object and rewriting string into ISO format because todatetime() does not work using the previous format.
| extend isNotInDateRange = not(extractedDate >= startDate and extractedDate <= endDate) // Created bool which is true if the date we extracted from the message is not within the specified range. This bool will also be true if the date was not found in the message.
| where properties.instanceView.status.message !contains "SQL Server Extension Agent: Healthy" // Begin searching for unhealthy extensions using the following 1. Does extension report being healthy. 2. Is last upload within the given range. 3. Is the upload status in an OK state. 4. Is provisioning state not in a succeeded state.
    or isNotInDateRange
    or properties.instanceView.status.message !contains "uploadStatus : OK"
    or properties.provisioningState != "Succeeded"
    or MachineStatus != "Connected"
| extend FailureReasons = strcat( // Makes a String to list all the reason that this resource got flagged for
        iif(MachineStatus != "Connected",strcat("- Machine's status is ", MachineStatus," -"),"") ,
        iif(MachineErrors != "[]","- Machine reports errors -", ""),
        iif(properties.instanceView.status.message !contains "SQL Server Extension Agent: Healthy","- Extension reported unhealthy -",""), 
        iif(isNotInDateRange,"- Last upload outside acceptable range -",""),
        iif(properties.instanceView.status.message !contains "uploadStatus : OK","- Upload status is not reported OK -",""), 
        iif(properties.provisioningState != "Succeeded",strcat("- Extension provisiong state is ", properties.provisioningState," -"),"") 
    )
| extend RecommendedAction = //Attempt to Identify RootCause based on information gathered, and point customer to what they should investigate first.
    iif(MachineStatus == "Disconnected", "Machine is disconnected. Please reconnect the machine.",
        iif(MachineStatus == "Expired", "Machine cert is expired. Go to the machine on the Azure portal for more information on how to resolve this issue.",
            iif(MachineStatus != "Connected", strcat("Machine status is ", MachineStatus,". Investigate and resolve this issue."),
                iif(MachineProvisioningStatus != "Succeeded", strcat("Machine provisioning status is ", MachineProvisioningStatus, ". Investigate and resolve machine provisioning status"),
                    iff(MachineErrors != "[]", "Machine is reporting errors. Investigate and resolve machine errors",
                        iif(properties.provisioningState != "Succeeded", strcat("Extension provisioning status is ", properties.provisioningState,". Investigate and resolve extension provisioning state."),
                            iff(properties.instanceView.status.message !contains "SQL Server Extension Agent:" and properties.instanceView.status.message contains "SQL Server Extension Agent Deployer", "SQL Server extension employer ran. However, SQL Server extension seems to not be running. Verify that the extension is currently running.",
                                iff(properties.instanceView.status.message !contains "uploadStatus : OK" or isNotInDateRange or properties.instanceView.status.message !contains "SQL Server Extension Agent: Healthy", "Extension reported as unhealthy. View FailureReasons and LastExtensionStatusMessage for more information as to the cause of the failure.",
                                    "Unable to recommend actions. Please view FailureReasons."
                                )
                            )
                        )
                    )
                )
            )
        )
    )
| project ID = id, MachineName, ResourceGroup = resourceGroup, SubscriptionID = subscriptionId, Location = location, RecommendedAction, FailureReasons, LicenseType = properties.settings.LicenseType, 
    LastReportedExtensionHealth = iif(properties.instanceView.status.message !contains "SQL Server Extension Agent: Healthy", "Unhealthy", "Healthy"),
    LastExtensionUploadTimestamp = iif(indexof(properties.instanceView.status.message, "timestampUTC : ") > 0,
        substring(properties.instanceView.status.message, indexof(properties.instanceView.status.message, "timestampUTC : ") + 15, 10),
        "no timestamp"),
    LastExtensionUploadStatus = iif(indexof(properties.instanceView.status.message, "uploadStatus : OK") > 0, "OK", "Unhealthy"),
    ExtensionProvisioningState = properties.provisioningState,
    MachineStatus, MachineErrors, MachineProvisioningStatus,MachineId = machineId,
    LastExtensionStatusMessage = properties.instanceView.status.message
"@

# Execute the Azure Resource Graph query
$result = Search-AzGraph -Query $query

# Output the results
$result | Format-Table -Property ExtensionHealth, LastUploadTimestamp, LastUploadStatus, Message

如需识别可能的问题,请查看 建议措施失败原因 列中的值。 RecommendedAction 列提供了解决问题可能需要采取的首要步骤或需要首先检查的线索。 FailureReasons 列会列出资源被视为运行不正常的原因。 最后,检查 LastExtensionStatusMessage 以查看代理最后报告的消息

识别缺少更新的扩展

识别没有最新状态更新的扩展。 此查询按扩展上次更新状态的天数排序,返回 SQL Server 的 Azure 扩展列表。 值为“-1”表示扩展崩溃,并且扩展状态中存在调用堆栈。

// Show the timestamp extracted
// If an extension has crashed (i.e. no heartbeat), fill timestamp with "1900/01/01, 00:00:00.000"
//
resources
| where type =~ 'microsoft.hybridcompute/machines/extensions'
| extend extensionStatus = parse_json(properties).instanceView.status.message
| extend timestampExtracted = extract(@"timestampUTC\s*:\s*(\d{4}/\d{2}/\d{2}, \d{2}:\d{2}:\d{2}\.\d{3})", 1, tostring(extensionStatus))
| extend timestampNullFilled = iff(isnull(timestampExtracted) or timestampExtracted == "", "1900/01/01, 00:00:00.000", timestampExtracted)
| extend timestampKustoFormattedString = strcat(replace(",", "", replace("/", "-", replace("/", "-", timestampNullFilled))), "Z")
| extend agentHeartbeatUtcTimestamp = todatetime(timestampKustoFormattedString)
| extend agentHeartbeatLagInDays = datetime_diff('day', now(), agentHeartbeatUtcTimestamp)
| project id, extensionStatus, agentHeartbeatUtcTimestamp, agentHeartbeatLagInDays
| limit 100
| order by ['agentHeartbeatLagInDays'] asc

此查询按扩展名上次更新状态的天数分组,返回扩展计数。 值为“-1”表示扩展崩溃,并且扩展状态中存在调用堆栈。

// Aggregate by timestamp
//
// -1: Crashed extension with no heartbeat, we got a stacktrace instead
//  0: Healthy
// >1: Stale/Offline
//
resources
| where type =~ 'microsoft.hybridcompute/machines/extensions'
| extend extensionStatus = parse_json(properties).instanceView.status.message
| extend timestampExtracted = extract(@"timestampUTC\s*:\s*(\d{4}/\d{2}/\d{2}, \d{2}:\d{2}:\d{2}\.\d{3})", 1, tostring(extensionStatus))
| extend timestampNullFilled = iff(isnull(timestampExtracted) or timestampExtracted == "", "1900/01/01, 00:00:00.000", timestampExtracted)
| extend timestampKustoFormattedString = strcat(replace(",", "", replace("/", "-", replace("/", "-", timestampNullFilled))), "Z")
| extend agentHeartbeatUtcTimestamp = todatetime(timestampKustoFormattedString)
| extend agentHeartbeatLagInDays = iff(agentHeartbeatUtcTimestamp == todatetime("1900/01/01, 00:00:00.000Z"), -1, datetime_diff('day', now(), agentHeartbeatUtcTimestamp))
| summarize numExtensions = count() by agentHeartbeatLagInDays
| order by numExtensions desc