Nomad v0.11.1 (b43457070037800fcc8442c8ff095ff4005dab33)
Ubuntu 18.04
Docker 19.03.8
VM in Azure
When registering a CSI volume in Nomad, I get:
Error registering volume: Unexpected response code: 500 (validate volume: Volume validation failed)
I started the controller and node jobs and they appeared healthy:
$ nomad plugin status az-disk0
ID = az-disk0
Provider = disk.csi.azure.com
Version = v0.8.0
Controllers Healthy = 1
Controllers Expected = 1
Nodes Healthy = 1
Nodes Expected = 1
Allocations
ID Node ID Task Group Version Desired Status Created Modified
222461f0 69aa2a40 controller 4 run running 3m39s ago 3m28s ago
dc040d8e 69aa2a40 nodes 4 run running 3m5s ago 3m4s ago
Volume registration file (volume.hcl):
id = "azure_prod_db1"
name = "database"
type = "csi"
external_id = "/subscriptions/xxxxxxxx-1234-5678-1234-agsbetsdsjde12/resourceGroups/rgtest/providers/Microsoft.Compute/disks/managed_disk_1"
plugin_id = "az-disk0"
access_mode = "single-node-writer"
attachment_mode = "file-system"
Controller job
job "plugin-azure-disk-controller" {
datacenters = ["dev1"]
group "controller" {
task "plugin" {
driver = "docker"
template {
change_mode = "noop"
destination = "local/azure.json"
data = <<EOH
{
"cloud":"AzurePublicCloud",
"tenantId": "my tenant",
"subscriptionId": "my subscription",
"aadClientId": "my clientid",
"aadClientSecret": "my secret",
"resourceGroup": "test-core-rg",
"location": "westeurope",
"vmType": "standard",
"subnetName": "test-core-subnet",
"securityGroupName": "internalproj-core-subnet-nsg",
"vnetName": "main-network",
"vnetResourceGroup": "management",
"routeTableName": "route-table",
"primaryAvailabilitySetName": "agentpool-availabilitySet-17181929",
"primaryScaleSetName": "",
"cloudProviderBackoff": true,
"cloudProviderBackoffRetries": 6,
"cloudProviderBackoffExponent": 1.5,
"cloudProviderBackoffDuration": 5,
"cloudProviderBackoffJitter": 1,
"cloudProviderRatelimit": false,
"cloudProviderRateLimitQPS": 3,
"cloudProviderRateLimitBucket": 10,
"useManagedIdentityExtension": false,
"userAssignedIdentityID": "",
"useInstanceMetadata": true,
"loadBalancerSku": "Basic",
"excludeMasterFromStandardLB": false,
"providerVaultName": "",
"providerKeyName": "k8s",
"providerKeyVersion": ""
}
EOH
}
env {
AZURE_CREDENTIAL_FILE = "/etc/kubernetes/azure.json"
}
config {
image = "mcr.microsoft.com/k8s/csi/azuredisk-csi"
volumes = [
"local/azure.json:/etc/kubernetes/azure.json"
]
args = [
"--nodeid=${attr.unique.hostname}",
"--endpoint=unix://csi/csi.sock",
"--logtostderr",
"--v=5",
]
}
csi_plugin {
id = "az-disk0"
type = "controller"
mount_dir = "/csi"
}
resources {
cpu = 500
memory = 256
}
}
}
}
Node job
job "plugin-azure-disk-nodes" {
datacenters = ["dev1"]
# you can run node plugins as service jobs as well, but this ensures
# that all nodes in the DC have a copy.
type = "system"
group "nodes" {
task "plugin" {
driver = "docker"
template {
change_mode = "noop"
destination = "local/azure.json"
data = <<EOH
{
"cloud":"AzurePublicCloud",
"tenantId": "my tenant",
"subscriptionId": "my subscription",
"aadClientId": "my clientid",
"aadClientSecret": "my secret",
"resourceGroup": "test-core-rg",
"location": "westeurope",
"vmType": "standard",
"subnetName": "test-core-subnet",
"securityGroupName": "internalproj-core-subnet-nsg",
"vnetName": "main-network",
"vnetResourceGroup": "management",
"routeTableName": "route-table",
"primaryAvailabilitySetName": "agentpool-availabilitySet-17181929",
"primaryScaleSetName": "",
"cloudProviderBackoff": true,
"cloudProviderBackoffRetries": 6,
"cloudProviderBackoffExponent": 1.5,
"cloudProviderBackoffDuration": 5,
"cloudProviderBackoffJitter": 1,
"cloudProviderRatelimit": false,
"cloudProviderRateLimitQPS": 3,
"cloudProviderRateLimitBucket": 10,
"useManagedIdentityExtension": false,
"userAssignedIdentityID": "",
"useInstanceMetadata": true,
"loadBalancerSku": "Basic",
"excludeMasterFromStandardLB": false,
"providerVaultName": "",
"providerKeyName": "k8s",
"providerKeyVersion": ""
}
EOH
}
env {
AZURE_CREDENTIAL_FILE = "/etc/kubernetes/azure.json"
}
config {
image = "mcr.microsoft.com/k8s/csi/azuredisk-csi"
volumes = [
"local/azure.json:/etc/kubernetes/azure.json"
]
args = [
"--nodeid=${attr.unique.hostname}",
"--endpoint=unix://csi/csi.sock",
"--logtostderr",
"--v=5",
]
# node plugins must run as privileged jobs because they
# mount disks to the host
privileged = true
}
csi_plugin {
id = "az-disk0"
type = "node"
mount_dir = "/csi"
}
resources {
cpu = 500
memory = 256
}
}
}
}
There is only one example, for AWS EBS, so I ported what I understood to an Azure configuration.
I cannot work out what my problem is when registering the volume. The error is very vague and I could not find any documentation that explains this behaviour. Can you please help me?
Archive.zip
Nomad server logs (the only relevant entry):
2020-04-27T15:33:06.188Z [ERROR] http: request failed: method=PUT path=/v1/volume/csi/mysql error="validate volume: Volume validation failed" code=500
Hi @carlosrbcunha! Thanks for reporting this!
The error messages are definitely not great. We're hoping to improve that with https://github.com/hashicorp/nomad/issues/7424.
But the error means that when Nomad tried to register the volume, the plugin rejected the volume definition. I see you provided the plugin logs, so let me dig into those a bit more and see if I can spot the problem. With some other plugins we've typically seen issues like the wrong ID, or the plugin lacking permissions to query the cloud provider.
I used the same credentials for every step, up to and including creating the disk in Azure with Terraform, as stated in the example.
The ID used came directly from the Terraform output. Is there any way I can make the logs more verbose and drill down on this?
Unfortunately not. You're hitting this bit on the client, so when the Nomad client gets a response back from the plugin, there's no error message associated with it. The relevant portion of the CSI spec is ValidateVolumeCapabilities, where the error message is optional. That's allowed by the spec, but not very helpful on the part of the plugin.
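To illustrate what that means in practice, here's a minimal sketch using stand-in structs that mirror the shape of the spec's ValidateVolumeCapabilitiesResponse message (not Nomad's or the plugin's actual types): both the confirmation and the message are optional, so a plugin can legally return an empty response and all the CO can surface is a generic failure.

package main

import "fmt"

// Stand-ins mirroring the shape of the CSI ValidateVolumeCapabilitiesResponse
// message (sketch only; the real generated types live in the
// container-storage-interface spec's Go package).
type confirmedCapabilities struct {
	volumeCapabilities []string // simplified; the spec uses structured VolumeCapability messages
}

type validateVolumeCapabilitiesResponse struct {
	confirmed *confirmedCapabilities // SHALL only be set for successful validations
	message   string                 // optional free-form text; plugins may leave it empty
}

func main() {
	// What the plugin appears to be returning here: no confirmation and no
	// message, leaving Nomad nothing specific to report to the user.
	resp := validateVolumeCapabilitiesResponse{}
	if resp.confirmed == nil && resp.message == "" {
		fmt.Println(`all the CO can say is "Volume validation failed"`)
	}
}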
I'm looking at the controller.logs file and not seeing the RPC call we make to the controller listed. Can you see if there's an error log on the Nomad client where the controller is running (look for the word grpc) for validating the volume?
Can you please instruct me on how to get that log? I am running everything on a single test server (both server and client). I must be doing something wrong.
Oh! If it's the same instance of Nomad, the logs will be mixed together. We record errors for RPCs to the plugins at the error level, but maybe the "invalid volume" RPC isn't being treated as an error by the plugin? Setting Nomad's log level to debug will show the RPCs to the plugin that succeed; let's see if they're in there.
The logs Carlos sent me earlier, with the debug log level:
2020-04-27T17:18:23.068Z [INFO] client: node registration complete
2020-04-27T17:20:33.500Z [DEBUG] http: request complete: method=GET path=/v1/plugins?type=csi duration=1.46327ms
2020-04-27T17:20:38.315Z [DEBUG] http: request complete: method=GET path=/v1/plugin/csi/az-disk0 duration=431.621µs
2020-04-27T17:20:50.166Z [ERROR] http: request failed: method=PUT path=/v1/volume/csi/mysql error="validate volume: Volume validation failed" code=500
2020-04-27T17:20:50.166Z [DEBUG] http: request complete: method=PUT path=/v1/volume/csi/mysql duration=427.464827ms
I'm working on another issue with ValidateVolume in the Linode plugin in https://github.com/hashicorp/nomad/issues/7743#issuecomment-621216593, and that inspired me to go look at the Azure plugin's code for the same RPC: controllerserver.go#L473.
The Azure plugin isn't setting the Confirmed field either, which suggests that we're possibly interpreting the spec wrong in Nomad on this field. Our tests with the hostpath, AWS EBS, and AWS EFS plugins didn't encounter the same problem. I'm going to do some digging into how k8s handles this code path and circle back here shortly.
After a pass through k8s and the spec, I'm seeing that we are incorrectly validating this response. The spec says:
Confirmed indicates to the CO the set of capabilities that the
plugin has validated. This field SHALL only be set to a non-empty
value for successful validation responses.
Which means that if the plugin has validated the capabilities, we should be checking to make sure they match what we expect; but if the plugin _doesn't_ validate them, that's not actually an error condition. It just means the plugin doesn't care to give us a response. I might not have written the spec that way, but it's definitely a bug in Nomad. Should be a straightforward fix.
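To make that concrete, here's roughly the interpretation I think we should be implementing, sketched with simplified stand-in types rather than the actual Nomad code or the CSI generated types: a nil Confirmed field is treated as "plugin declined to validate" and passes, and only a populated Confirmed field gets compared against the capabilities we asked for.

package main

import "fmt"

// Simplified stand-ins for the CSI types (sketch only, not Nomad's real code).
type volumeCapability struct {
	accessMode     string // e.g. "single-node-writer"
	attachmentMode string // e.g. "file-system"
}

type confirmed struct {
	volumeCapabilities []volumeCapability
}

type validateResponse struct {
	confirmed *confirmed
	message   string
}

// checkValidateResponse sketches the corrected reading of the spec: an unset
// Confirmed field is not an error, it just means the plugin chose not to
// confirm; a set Confirmed field is checked against the requested capabilities.
func checkValidateResponse(resp validateResponse, requested []volumeCapability) error {
	if resp.confirmed == nil {
		return nil // plugin didn't confirm anything; allowed by the spec
	}
	for _, want := range requested {
		ok := false
		for _, got := range resp.confirmed.volumeCapabilities {
			if got == want {
				ok = true
				break
			}
		}
		if !ok {
			return fmt.Errorf("capability %+v not confirmed by plugin: %q", want, resp.message)
		}
	}
	return nil
}

func main() {
	requested := []volumeCapability{{accessMode: "single-node-writer", attachmentMode: "file-system"}}
	// With this reading, an empty response like the Azure plugin's no longer fails registration.
	fmt.Println(checkValidateResponse(validateResponse{}, requested)) // <nil>
}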
I've opened https://github.com/hashicorp/nomad/pull/7831 with a patch to fix the validation.
The master branch now has the patch for the volume validation. I don't have an Azure setup handy at the moment but if you don't get a chance to try it, I'll take a crack at getting it tested myself sometime early next week.
Success! This will ship in 0.11.2 shortly.
$ nomad volume status azure_test
ID = azure_test_1
Name = database
External ID = /subscriptions/REDACTED/resourceGroups/nomad-testing/providers/Microsoft.Compute/disks/nomad-test-csi-vol1
Plugin ID = az-disk0
Provider = disk.csi.azure.com
Version = v0.8.0
Schedulable = false
Controllers Healthy = 1
Controllers Expected = 1
Nodes Healthy = 0
Nodes Expected = 0
Access Mode = single-node-writer
Attachment Mode = file-system
Mount Options = <none>
Namespace = default
Allocations
No allocations placed
By the way, it looks like most of that big k8s creds block isn't needed. The jobs I ran were as follows:
controller.nomad
job "plugin-azure-disk-controller" {
datacenters = ["dc1"]
group "controller" {
task "plugin" {
driver = "docker"
template {
change_mode = "noop"
destination = "local/azure.json"
data = <<EOH
{
"cloud":"AzurePublicCloud",
"tenantId": "REDACTED",
"subscriptionId": "REDACTED",
"aadClientId": "REDACTED",
"aadClientSecret": "REDACTED",
"resourceGroup": "nomad-testing",
"location": "eastus",
}
EOH
}
env {
AZURE_CREDENTIAL_FILE = "/etc/kubernetes/azure.json"
}
config {
image = "mcr.microsoft.com/k8s/csi/azuredisk-csi"
volumes = [
"local/azure.json:/etc/kubernetes/azure.json",
]
args = [
"--nodeid=${attr.unique.hostname}",
"--endpoint=unix://csi/csi.sock",
"--logtostderr",
"--v=5",
]
}
csi_plugin {
id = "az-disk0"
type = "controller"
mount_dir = "/csi"
}
resources {
cpu = 500
memory = 256
}
}
}
}
node.nomad
job "plugin-azure-disk-nodes" {
datacenters = ["dev1"]
# you can run node plugins as service jobs as well, but this ensures
# that all nodes in the DC have a copy.
type = "system"
group "nodes" {
task "plugin" {
driver = "docker"
template {
change_mode = "noop"
destination = "local/azure.json"
data = <<EOH
{
"cloud":"AzurePublicCloud",
"tenantId": "REDACTED",
"subscriptionId": "REDACTED",
"aadClientId": "REDACTED",
"aadClientSecret": "REDACTED",
"resourceGroup": "nomad-testing",
"location": "eastus",
}
EOH
}
env {
AZURE_CREDENTIAL_FILE = "/etc/kubernetes/azure.json"
}
config {
image = "mcr.microsoft.com/k8s/csi/azuredisk-csi"
volumes = [
"local/azure.json:/etc/kubernetes/azure.json",
]
args = [
"--nodeid=${attr.unique.hostname}",
"--endpoint=unix://csi/csi.sock",
"--logtostderr",
"--v=5",
]
# node plugins must run as privileged jobs because they
# mount disks to the host
privileged = true
}
csi_plugin {
id = "az-disk0"
type = "node"
mount_dir = "/csi"
}
resources {
cpu = 500
memory = 256
}
}
}
}