Nomad: improper IP address registration when address_mode is 'driver'

Created on 20 Dec 2017 · 15 comments · Source: hashicorp/nomad

Nomad version

Nomad v0.7.1 (0b295d399d00199cfab4621566babd25987ba06e)

We have the following job specification:

        config
        {
            image = "playrix/zabbix-proxy-sqllite:3.4.4"
            load = "playrix-zabbix-proxy-sqllite-3.4.4.tar.gz"

            network_mode = "macvlan212"
            ipv4_address = "172.16.12.3"
            dns_servers = ["172.16.9.99"]

            command = "/sbin/init_plrx"
            args = ["-c", "/usr/bin/runsvdir -P /etc/service/"]

            volume_driver = "resmon"
            volumes = [
                "zabbixproxy:/var/lib/zabbixproxy"
            ]
        }

        env
        {
            ZBXSERVER = "rootzabbixserver.service.atf01.consul",
            ZBXPROXYNAME = "vol-zabbix-proxy",
            ZBXDATADIR = "/var/lib/zabbixproxy"
        }

        service
        {
            name = "zabbixserver"
            address_mode = "driver"
        }

Prior to the upgrade to 0.7.1, Nomad properly registered the driver IP address for the service; now Nomad registers the host IP address instead of the driver's.

Labels: theme/networking, type/bug

All 15 comments

Oh no! I'm sorry upgrading to 0.7.1 broke this.

  • What version were you running before?
  • What version of docker are you running?
  • Can you share the output of docker inspect $container_id?

We upgraded from version 0.6.3, and we use Docker version 17.09.0-ce, build afdb6d4:

root@vol-h-docker-02:/home/ruslan# docker inspect dd453cfc4df8
[
    {
        "Id": "dd453cfc4df8ba4f393828e11a81a07114942e28ef4c8c21b27974e9675cb405",
        "Created": "2017-12-20T21:00:51.533913042Z",
        "Path": "/sbin/init_plrx",
        "Args": [
            "-c",
            "/usr/bin/runsvdir -P /etc/service/"
        ],
        "State": {
            "Status": "running",
            "Running": true,
            "Paused": false,
            "Restarting": false,
            "OOMKilled": false,
            "Dead": false,
            "Pid": 3010,
            "ExitCode": 0,
            "Error": "",
            "StartedAt": "2017-12-20T21:00:58.561563438Z",
            "FinishedAt": "0001-01-01T00:00:00Z"
        },
        "Image": "sha256:7f7d32d94af96288b8659fb3d153cce19208fc3bf0d9fc51b0158902f77f7959",
        "ResolvConfPath": "/var/lib/docker/containers/dd453cfc4df8ba4f393828e11a81a07114942e28ef4c8c21b27974e9675cb405/resolv.conf",
        "HostnamePath": "/var/lib/docker/containers/dd453cfc4df8ba4f393828e11a81a07114942e28ef4c8c21b27974e9675cb405/hostname",
        "HostsPath": "/var/lib/docker/containers/dd453cfc4df8ba4f393828e11a81a07114942e28ef4c8c21b27974e9675cb405/hosts",
        "LogPath": "",
        "Name": "/zabbixproxy-3a9be4df-5d33-de04-5d59-e44c4ec737c1",
        "RestartCount": 0,
        "Driver": "aufs",
        "MountLabel": "",
        "ProcessLabel": "",
        "AppArmorProfile": "docker-default",
        "ExecIDs": null,
        "HostConfig": {
            "Binds": [
                "/var/lib/nomad/alloc/3a9be4df-5d33-de04-5d59-e44c4ec737c1/alloc:/alloc",
                "/var/lib/nomad/alloc/3a9be4df-5d33-de04-5d59-e44c4ec737c1/zabbixproxy/local:/local",
                "/var/lib/nomad/alloc/3a9be4df-5d33-de04-5d59-e44c4ec737c1/zabbixproxy/secrets:/secrets",
                "zabbixproxy:/var/lib/zabbixproxy"
            ],
            "ContainerIDFile": "",
            "LogConfig": {
                "Type": "syslog",
                "Config": {
                    "syslog-address": "unix:///tmp/plugin415184535"
                }
            },
            "NetworkMode": "macvlan212",
            "PortBindings": null,
            "RestartPolicy": {
                "Name": "",
                "MaximumRetryCount": 0
            },
            "AutoRemove": false,
            "VolumeDriver": "resmon",
            "VolumesFrom": null,
            "CapAdd": null,
            "CapDrop": null,
            "Dns": [
                "172.16.9.99"
            ],
            "DnsOptions": null,
            "DnsSearch": null,
            "ExtraHosts": null,
            "GroupAdd": null,
            "IpcMode": "",
            "Cgroup": "",
            "Links": null,
            "OomScoreAdj": 0,
            "PidMode": "",
            "Privileged": false,
            "PublishAllPorts": false,
            "ReadonlyRootfs": false,
            "SecurityOpt": null,
            "UTSMode": "",
            "UsernsMode": "",
            "ShmSize": 67108864,
            "Runtime": "runc",
            "ConsoleSize": [
                0,
                0
            ],
            "Isolation": "",
            "CpuShares": 1500,
            "Memory": 943718400,
            "NanoCpus": 0,
            "CgroupParent": "",
            "BlkioWeight": 0,
            "BlkioWeightDevice": null,
            "BlkioDeviceReadBps": null,
            "BlkioDeviceWriteBps": null,
            "BlkioDeviceReadIOps": null,
            "BlkioDeviceWriteIOps": null,
            "CpuPeriod": 0,
            "CpuQuota": 0,
            "CpuRealtimePeriod": 0,
            "CpuRealtimeRuntime": 0,
            "CpusetCpus": "",
            "CpusetMems": "",
            "Devices": null,
            "DeviceCgroupRules": null,
            "DiskQuota": 0,
            "KernelMemory": 0,
            "MemoryReservation": 0,
            "MemorySwap": -1,
            "MemorySwappiness": 0,
            "OomKillDisable": false,
            "PidsLimit": 0,
            "Ulimits": null,
            "CpuCount": 0,
            "CpuPercent": 0,
            "IOMaximumIOps": 0,
            "IOMaximumBandwidth": 0
        },
        "GraphDriver": {
            "Data": null,
            "Name": "aufs"
        },
        "Mounts": [
            {
                "Type": "bind",
                "Source": "/var/lib/nomad/alloc/3a9be4df-5d33-de04-5d59-e44c4ec737c1/alloc",
                "Destination": "/alloc",
                "Mode": "",
                "RW": true,
                "Propagation": ""
            },
            {
                "Type": "bind",
                "Source": "/var/lib/nomad/alloc/3a9be4df-5d33-de04-5d59-e44c4ec737c1/zabbixproxy/local",
                "Destination": "/local",
                "Mode": "",
                "RW": true,
                "Propagation": ""
            },
            {
                "Type": "bind",
                "Source": "/var/lib/nomad/alloc/3a9be4df-5d33-de04-5d59-e44c4ec737c1/zabbixproxy/secrets",
                "Destination": "/secrets",
                "Mode": "",
                "RW": true,
                "Propagation": ""
            },
            {
                "Type": "volume",
                "Name": "zabbixproxy",
                "Source": "/var/lib/rbd/volumes/zabbixproxy",
                "Destination": "/var/lib/zabbixproxy",
                "Driver": "resmon",
                "Mode": "",
                "RW": true,
                "Propagation": ""
            }
        ],
        "Config": {
            "Hostname": "dd453cfc4df8",
            "Domainname": "",
            "User": "",
            "AttachStdin": false,
            "AttachStdout": false,
            "AttachStderr": false,
            "Tty": false,
            "OpenStdin": false,
            "StdinOnce": false,
            "Env": [
                "NOMAD_GROUP_NAME=zabbixproxy",
                "NOMAD_TASK_NAME=zabbixproxy",
                "NOMAD_ALLOC_ID=3a9be4df-5d33-de04-5d59-e44c4ec737c1",
                "NOMAD_DC=test",
                "NOMAD_MEMORY_LIMIT=900",
                "NOMAD_ALLOC_INDEX=0",
                "ZBXSERVER=rootzabbixserver.service.atf01.consul",
                "NOMAD_ALLOC_NAME=zabbixproxy.zabbixproxy[0]",
                "NOMAD_REGION=global",
                "ZBXPROXYNAME=vol-zabbix-proxy",
                "NOMAD_TASK_DIR=/local",
                "NOMAD_CPU_LIMIT=1500",
                "NOMAD_SECRETS_DIR=/secrets",
                "NOMAD_ALLOC_DIR=/alloc",
                "NOMAD_JOB_NAME=zabbixproxy",
                "ZBXDATADIR=/var/lib/zabbixproxy"
            ],
            "Cmd": [
                "/sbin/init_plrx",
                "-c",
                "/usr/bin/runsvdir -P /etc/service/"
            ],
            "Image": "sha256:7f7d32d94af96288b8659fb3d153cce19208fc3bf0d9fc51b0158902f77f7959",
            "Volumes": null,
            "WorkingDir": "",
            "Entrypoint": null,
            "OnBuild": null,
            "Labels": {}
        },
        "NetworkSettings": {
            "Bridge": "",
            "SandboxID": "dac3fb08acc9ed5f2131292a625737fa7e5221a41b6f91042aed097fa747c0da",
            "HairpinMode": false,
            "LinkLocalIPv6Address": "",
            "LinkLocalIPv6PrefixLen": 0,
            "Ports": {},
            "SandboxKey": "/var/run/docker/netns/dac3fb08acc9",
            "SecondaryIPAddresses": null,
            "SecondaryIPv6Addresses": null,
            "EndpointID": "",
            "Gateway": "",
            "GlobalIPv6Address": "",
            "GlobalIPv6PrefixLen": 0,
            "IPAddress": "",
            "IPPrefixLen": 0,
            "IPv6Gateway": "",
            "MacAddress": "",
            "Networks": {
                "macvlan212": {
                    "IPAMConfig": {
                        "IPv4Address": "172.16.12.3"
                    },
                    "Links": null,
                    "Aliases": [
                        "dd453cfc4df8"
                    ],
                    "NetworkID": "8c41c8c5cd0f8095f926ab15e326026568fc57bc60ae5e52cef0d752ff00073d",
                    "EndpointID": "7cf294048265a9c3a7926578086bd01f53371b4edf722d9aa9438a21626873eb",
                    "Gateway": "172.16.12.1",
                    "IPAddress": "172.16.12.3",
                    "IPPrefixLen": 24,
                    "IPv6Gateway": "",
                    "GlobalIPv6Address": "",
                    "GlobalIPv6PrefixLen": 0,
                    "MacAddress": "02:42:ac:10:0c:03"
                }
            }
        }
    }
]

For now we have reverted all Nomad agents back to version 0.6.3 and manually removed, via the Consul API, all service registrations left behind by Nomad 0.7.1. It was an epic fail :-)
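For anyone who needs to do the same cleanup: it boils down to finding the catalog entries with the `_nomad-` ServiceID prefix and deregistering them. A rough sketch (not our exact script; it assumes a local Consul agent, the `requests` library, and that the stale entries are recognizable by that prefix):

    # Sketch: deregister stale Nomad service entries via the Consul catalog API.
    import requests

    CONSUL = "http://127.0.0.1:8500"  # assumption: local Consul agent
    SERVICE = "zabbixserver"

    for entry in requests.get(f"{CONSUL}/v1/catalog/service/{SERVICE}").json():
        # Nomad-created registrations carry a "_nomad-" ServiceID prefix.
        if entry["ServiceID"].startswith("_nomad-"):
            requests.put(f"{CONSUL}/v1/catalog/deregister", json={
                "Node": entry["Node"],
                "ServiceID": entry["ServiceID"],
            })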

@tantra35 And just to be clear 172.16.12.3 was not advertised? The host's IP was?

@schmichael Yes. For example, before the upgrade zabbixserver.service.consul resolved to 172.16.12.3; after the upgrade it began resolving to 172.16.9.2, the host where that container was placed.

Also, when we cleaned the Nomad 0.7.1 garbage service registrations out of Consul, we saw the following:

root@vol-h-docker-01:/home/ruslan# python ./test.py
{u'Node': u'vol-h-docker-01', u'Datacenter': u'test', u'CreateIndex': 45549647, u'ServiceName': u'zabbixserver', u'TaggedAddresses': {u'wan': u'172.16.9.2', u'lan': u'172.16.9.2'}, u'ModifyIndex': 45549679, u'ServicePort': 0, u'ServiceID': u'_nomad-task-esy5prk576ju2bw5atxoca5wzba6g3rb', u'ServiceAddress': u'', u'Address': u'172.16.9.2', u'ServiceTags': [], u'NodeMeta': {u'consul-network-segment': u''}, u'ServiceEnableTagOverride': False, u'ID': u'09cebb81-a1f3-48c6-dea2-7b250af4281d'}
{u'Node': u'vol-h-docker-02', u'Datacenter': u'test', u'CreateIndex': 45550302, u'ServiceName': u'zabbixserver', u'TaggedAddresses': {u'wan': u'172.16.9.199', u'lan': u'172.16.9.199'}, u'ModifyIndex': 45550351, u'ServicePort': 0, u'ServiceID': u'_nomad-executor-3a9be4df-5d33-de04-5d59-e44c4ec737c1-zabbixproxy-zabbixserver', u'ServiceAddress': u'172.16.12.3', u'Address': u'172.16.9.199', u'ServiceTags': [], u'NodeMeta': {u'consul-network-segment': u''}, u'ServiceEnableTagOverride': False, u'ID': u'00000000-0000-0000-0000-0cc47ab62b5a'}

The ServiceAddress for the service registration created by Nomad 0.7.1 was empty. The other buggy service registrations had the same empty ServiceAddress field.
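(test.py is essentially just a dump of the Consul catalog; a minimal equivalent, assuming a local Consul agent and the `requests` library, looks like this:)

    # Sketch: print every catalog entry for the service so the
    # ServiceAddress field can be inspected.
    import requests

    CONSUL = "http://127.0.0.1:8500"  # assumption: local Consul agent

    for entry in requests.get(f"{CONSUL}/v1/catalog/service/zabbixserver").json():
        print(entry)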

@tantra35 Ah! I see the issue. You're not specifying a port. Prior to 0.6 if a port wasn't specified we'd simply use the IP and set the port to 0.

In 0.7.1 rc1 I required ports to be set. This broke backward compatibility and #3673 was filed.

We decided unset ports should be allowed, and I attempted to revert to pre-0.7.1-rc1 behavior in PR #3674.

However, when a port isn't specified, I don't register the IP either! That still breaks backward compatibility.

Workaround

While we're discussing the proper fix, you can work around this by specifying a non-zero port for the service. You don't even have to create a port label. The service can literally be:

        service
        {
            name = "zabbixserver"
            address_mode = "driver"

            # set any non-zero port to get the IP to register
            port = "1"
        }

Hm, we had thought about the port, but panic forced us to revert to the previous version. And when we read the discussion in GH-3673, I thought that a missing port in the job description would not cause any issue, because of this:

However after hearing your use case (and discussing some other potential uses I'm sure others' are using), we've decided to allow services without ports. I'll get a PR up ASAP and it will be included in 0.7.1 final.

Thanks to the workaround, we will try to upgrade again.

Just updated the PR again with some logging improvements and included #3680 in the binaries if you have time to test.

@schmichael Cool! I don't have enough English skill to fully express my admiration of your work. We will try the fix as soon as possible, within 1-2 days.

@schmichael We investigated this issue on our side, inspected all our jobs, and found one with this description (I omit some job details):

config
{
    image = "playrix/elasticsearch:2.4.6-po01"
    load = "playrix-elasticsearch-2.4.6-po01.tar.gz"

    command = "/startup.sh"
    network_mode = "macvlan212"

    ipv4_address = "172.16.12.2"
    dns_servers = ["172.16.9.99"]

    volume_driver = "resmon"
    volumes = [
        "elasticsearch:/home/elasticsearch"
    ]
}

service
{
    name = "elasticsearch"
    address_mode = "driver"
    port = "appport"
}

resources
{
    memory = 4096
    cpu = 1000

    network
    {
        mbits = 10

        port "appport"
        {
           static = "9200"
        }

        port "jmx"
        {
           static = "9288"
        }
    }
}

but in this case the elasticsearch service does not register at all. Only after we applied the workaround with a fake port was the service's functionality restored. As I understand it, this is not the intended behaviour, because the port was formally declared.

We just tested your binary and something strange happened: for some time, some services disappeared from the Consul DNS zone (this happened not for all services), and after 5 minutes they appeared again. There is no info about this in the Nomad client logs.
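To catch the window where a service is missing, the catalog can be polled with something like this (a sketch only; it assumes a local Consul agent and the `requests` library, and the service names are just examples):

    # Sketch: log whenever a watched service has no catalog entries.
    import time
    import requests

    CONSUL = "http://127.0.0.1:8500"  # assumption: local Consul agent
    SERVICES = ["zabbixfrontend", "zabbixserver"]

    while True:
        for name in SERVICES:
            entries = requests.get(f"{CONSUL}/v1/catalog/service/{name}").json()
            if not entries:
                print(time.strftime("%H:%M:%S"), name, "has no registrations")
        time.sleep(10)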

The service declaration for the affected service was absolutely legal:

        service
        {
            name = "zabbixfrontend"
            port = "appport"
            check
            {
                name = "alive"
                type = "tcp"
                interval = "10s"
                timeout = "2s"
                initial_status = "critical"
            }
        }

On the server side we see the following in the logs:

Dec 21 14:29:50 consulnomad-01 nomad.sh[18322]: ==> Caught signal: interrupt
Dec 21 14:29:50 consulnomad-01 nomad.sh[18322]:     2017/12/21 14:29:50 [WARN] serf: Shutdown without a Leave
Dec 21 14:29:50 consulnomad-01 systemd[1]: Stopping Nomad agent...
Dec 21 14:29:50 consulnomad-01 nomad.sh[18322]:     2017/12/21 14:29:50 [ERR] raft-net: Failed to decode incoming command: transport shutdown
Dec 21 14:29:50 consulnomad-01 nomad.sh[18322]:     2017/12/21 14:29:50 [ERR] raft-net: Failed to decode incoming command: transport shutdown
Dec 21 14:29:50 consulnomad-01 nomad.sh[18322]:     2017/12/21 14:29:50.551586 [ERR] consul.sync: error deregistering agent service (id: "f5eacdbcaf552d5916571357b791871f154dafc7"): Unexpected response code: 500 (Unknown check "f5eacdbcaf552d5916571357b791871f154dafc7")
Dec 21 14:29:50 consulnomad-01 nomad.sh[18322]:     2017/12/21 14:29:50.552174 [ERR] consul.sync: error deregistering agent service (id: "4f3f89f79df795b9da5edee4a127c64bd9c556d2"): Unexpected response code: 500 (Unknown check "4f3f89f79df795b9da5edee4a127c64bd9c556d2")
Dec 21 14:29:50 consulnomad-01 nomad.sh[18322]:     2017/12/21 14:29:50.552721 [ERR] consul.sync: error deregistering agent service (id: "93b2c5fd25cbc9f404cca585bcfe9d56cac49675"): Unexpected response code: 500 (Unknown check "93b2c5fd25cbc9f404cca585bcfe9d56cac49675")
Dec 21 14:29:50 consulnomad-01 systemd[1]: nomad.service: Main process exited, code=exited, status=1/FAILURE
Dec 21 14:29:50 consulnomad-01 systemd[1]: Stopped Nomad agent.
Dec 21 14:29:50 consulnomad-01 systemd[1]: nomad.service: Unit entered failed state.
Dec 21 14:29:50 consulnomad-01 systemd[1]: nomad.service: Failed with result 'exit-code'.
Dec 21 14:29:50 consulnomad-01 systemd[1]: Started Nomad agent.
Dec 21 14:29:50 consulnomad-01 nomad.sh[20948]:     Loaded configuration from /etc/nomad/nomad.json
Dec 21 14:29:50 consulnomad-01 nomad.sh[20948]: ==> Starting Nomad agent...
Dec 21 14:29:50 consulnomad-01 nomad.sh[20948]: ==> Nomad agent configuration:
Dec 21 14:29:50 consulnomad-01 nomad.sh[20948]:                 Client: false
Dec 21 14:29:50 consulnomad-01 nomad.sh[20948]:              Log Level: WARN
Dec 21 14:29:50 consulnomad-01 nomad.sh[20948]:                 Region: atf01 (DC: test)
Dec 21 14:29:50 consulnomad-01 nomad.sh[20948]:                 Server: true
Dec 21 14:29:50 consulnomad-01 nomad.sh[20948]:                Version: 0.8.0-dev
Dec 21 14:29:50 consulnomad-01 nomad.sh[20948]: ==> Nomad agent started! Log data will stream in below:

I don't think this is a problem of this custom binary; it is present in the whole Nomad 0.7.1 branch.

@schmichael In this job:

job "zabbixproxy"
{

    ...............

    task "zabbixproxy"
    {
        driver = "docker"
        kill_timeout = "1m"

        ..............

        config
        {
            image = "playrix/zabbix-proxy-sqllite:3.4.4"
            load = "playrix-zabbix-proxy-sqllite-3.4.4.tar.gz"

            network_mode = "host"

            command = "/sbin/init_plrx"
            args = ["-c", "/usr/bin/runsvdir -P /etc/service/"]
        }

        ........        

        service
        {
            name = "zabbixserver"
            address_mode = "driver"
            port = "appport"
        }

        ........

        resources
        {

          ............... 

            network
            {
                mbits = 10
                port "appport"
                {
                    static = "10051"
                }
            }
        }
    }
}

When we specify a port in the service definition, we get the following error:

01/12/18 18:42:38 STD  Driver Failure         unable to get address for service "zabbixserver": invalid port "appport": strconv.Atoi: parsing "appport": invalid syntax

and the job goes to the dead state. Symbolic port references only keep working when network_mode is the default, i.e. bridge.

That is an unfortunate error message that should be improved. You should not use service.address_mode = "driver" with network_mode = "host": the driver won't have any network information in host network mode, so you want to let the service use the host network, which should properly resolve the port label.

Sorry it's gotten so complicated. I will try to improve the error message to at least point you in the right direction!

I pushed an improvement to the error message in #3682 to hopefully help make debugging this easier.

If you remove address_mode=driver from the service it should work.

This is still an issue for IPv6, to the point where it completely overrides any IPv6 network settings you have on Docker. @schmichael
