Nomad v0.7.1 (0b295d399d00199cfab4621566babd25987ba06e)
We have the following job specification:
config
{
image = "playrix/zabbix-proxy-sqllite:3.4.4"
load = "playrix-zabbix-proxy-sqllite-3.4.4.tar.gz"
network_mode = "macvlan212"
ipv4_address = "172.16.12.3"
dns_servers = ["172.16.9.99"]
command = "/sbin/init_plrx"
args = ["-c", "/usr/bin/runsvdir -P /etc/service/"]
volume_driver = "resmon"
volumes = [
"zabbixproxy:/var/lib/zabbixproxy"
]
}
env
{
ZBXSERVER = "rootzabbixserver.service.atf01.consul",
ZBXPROXYNAME = "vol-zabbix-proxy",
ZBXDATADIR = "/var/lib/zabbixproxy"
}
service
{
name = "zabbixserver"
address_mode = "driver"
}
Prior to upgrading to 0.7.1, Nomad properly registered the driver IP address for the service; now Nomad registers the host IP address, not the driver's.
Oh no! I'm sorry upgrading to 0.7.1 broke this.
Can you post the output of docker inspect $container_id?
We upgraded from version 0.6.3, and we use Docker version 17.09.0-ce, build afdb6d4:
root@vol-h-docker-02:/home/ruslan# docker inspect dd453cfc4df8
[
{
"Id": "dd453cfc4df8ba4f393828e11a81a07114942e28ef4c8c21b27974e9675cb405",
"Created": "2017-12-20T21:00:51.533913042Z",
"Path": "/sbin/init_plrx",
"Args": [
"-c",
"/usr/bin/runsvdir -P /etc/service/"
],
"State": {
"Status": "running",
"Running": true,
"Paused": false,
"Restarting": false,
"OOMKilled": false,
"Dead": false,
"Pid": 3010,
"ExitCode": 0,
"Error": "",
"StartedAt": "2017-12-20T21:00:58.561563438Z",
"FinishedAt": "0001-01-01T00:00:00Z"
},
"Image": "sha256:7f7d32d94af96288b8659fb3d153cce19208fc3bf0d9fc51b0158902f77f7959",
"ResolvConfPath": "/var/lib/docker/containers/dd453cfc4df8ba4f393828e11a81a07114942e28ef4c8c21b27974e9675cb405/resolv.conf",
"HostnamePath": "/var/lib/docker/containers/dd453cfc4df8ba4f393828e11a81a07114942e28ef4c8c21b27974e9675cb405/hostname",
"HostsPath": "/var/lib/docker/containers/dd453cfc4df8ba4f393828e11a81a07114942e28ef4c8c21b27974e9675cb405/hosts",
"LogPath": "",
"Name": "/zabbixproxy-3a9be4df-5d33-de04-5d59-e44c4ec737c1",
"RestartCount": 0,
"Driver": "aufs",
"MountLabel": "",
"ProcessLabel": "",
"AppArmorProfile": "docker-default",
"ExecIDs": null,
"HostConfig": {
"Binds": [
"/var/lib/nomad/alloc/3a9be4df-5d33-de04-5d59-e44c4ec737c1/alloc:/alloc",
"/var/lib/nomad/alloc/3a9be4df-5d33-de04-5d59-e44c4ec737c1/zabbixproxy/local:/local",
"/var/lib/nomad/alloc/3a9be4df-5d33-de04-5d59-e44c4ec737c1/zabbixproxy/secrets:/secrets",
"zabbixproxy:/var/lib/zabbixproxy"
],
"ContainerIDFile": "",
"LogConfig": {
"Type": "syslog",
"Config": {
"syslog-address": "unix:///tmp/plugin415184535"
}
},
"NetworkMode": "macvlan212",
"PortBindings": null,
"RestartPolicy": {
"Name": "",
"MaximumRetryCount": 0
},
"AutoRemove": false,
"VolumeDriver": "resmon",
"VolumesFrom": null,
"CapAdd": null,
"CapDrop": null,
"Dns": [
"172.16.9.99"
],
"DnsOptions": null,
"DnsSearch": null,
"ExtraHosts": null,
"GroupAdd": null,
"IpcMode": "",
"Cgroup": "",
"Links": null,
"OomScoreAdj": 0,
"PidMode": "",
"Privileged": false,
"PublishAllPorts": false,
"ReadonlyRootfs": false,
"SecurityOpt": null,
"UTSMode": "",
"UsernsMode": "",
"ShmSize": 67108864,
"Runtime": "runc",
"ConsoleSize": [
0,
0
],
"Isolation": "",
"CpuShares": 1500,
"Memory": 943718400,
"NanoCpus": 0,
"CgroupParent": "",
"BlkioWeight": 0,
"BlkioWeightDevice": null,
"BlkioDeviceReadBps": null,
"BlkioDeviceWriteBps": null,
"BlkioDeviceReadIOps": null,
"BlkioDeviceWriteIOps": null,
"CpuPeriod": 0,
"CpuQuota": 0,
"CpuRealtimePeriod": 0,
"CpuRealtimeRuntime": 0,
"CpusetCpus": "",
"CpusetMems": "",
"Devices": null,
"DeviceCgroupRules": null,
"DiskQuota": 0,
"KernelMemory": 0,
"MemoryReservation": 0,
"MemorySwap": -1,
"MemorySwappiness": 0,
"OomKillDisable": false,
"PidsLimit": 0,
"Ulimits": null,
"CpuCount": 0,
"CpuPercent": 0,
"IOMaximumIOps": 0,
"IOMaximumBandwidth": 0
},
"GraphDriver": {
"Data": null,
"Name": "aufs"
},
"Mounts": [
{
"Type": "bind",
"Source": "/var/lib/nomad/alloc/3a9be4df-5d33-de04-5d59-e44c4ec737c1/alloc",
"Destination": "/alloc",
"Mode": "",
"RW": true,
"Propagation": ""
},
{
"Type": "bind",
"Source": "/var/lib/nomad/alloc/3a9be4df-5d33-de04-5d59-e44c4ec737c1/zabbixproxy/local",
"Destination": "/local",
"Mode": "",
"RW": true,
"Propagation": ""
},
{
"Type": "bind",
"Source": "/var/lib/nomad/alloc/3a9be4df-5d33-de04-5d59-e44c4ec737c1/zabbixproxy/secrets",
"Destination": "/secrets",
"Mode": "",
"RW": true,
"Propagation": ""
},
{
"Type": "volume",
"Name": "zabbixproxy",
"Source": "/var/lib/rbd/volumes/zabbixproxy",
"Destination": "/var/lib/zabbixproxy",
"Driver": "resmon",
"Mode": "",
"RW": true,
"Propagation": ""
}
],
"Config": {
"Hostname": "dd453cfc4df8",
"Domainname": "",
"User": "",
"AttachStdin": false,
"AttachStdout": false,
"AttachStderr": false,
"Tty": false,
"OpenStdin": false,
"StdinOnce": false,
"Env": [
"NOMAD_GROUP_NAME=zabbixproxy",
"NOMAD_TASK_NAME=zabbixproxy",
"NOMAD_ALLOC_ID=3a9be4df-5d33-de04-5d59-e44c4ec737c1",
"NOMAD_DC=test",
"NOMAD_MEMORY_LIMIT=900",
"NOMAD_ALLOC_INDEX=0",
"ZBXSERVER=rootzabbixserver.service.atf01.consul",
"NOMAD_ALLOC_NAME=zabbixproxy.zabbixproxy[0]",
"NOMAD_REGION=global",
"ZBXPROXYNAME=vol-zabbix-proxy",
"NOMAD_TASK_DIR=/local",
"NOMAD_CPU_LIMIT=1500",
"NOMAD_SECRETS_DIR=/secrets",
"NOMAD_ALLOC_DIR=/alloc",
"NOMAD_JOB_NAME=zabbixproxy",
"ZBXDATADIR=/var/lib/zabbixproxy"
],
"Cmd": [
"/sbin/init_plrx",
"-c",
"/usr/bin/runsvdir -P /etc/service/"
],
"Image": "sha256:7f7d32d94af96288b8659fb3d153cce19208fc3bf0d9fc51b0158902f77f7959",
"Volumes": null,
"WorkingDir": "",
"Entrypoint": null,
"OnBuild": null,
"Labels": {}
},
"NetworkSettings": {
"Bridge": "",
"SandboxID": "dac3fb08acc9ed5f2131292a625737fa7e5221a41b6f91042aed097fa747c0da",
"HairpinMode": false,
"LinkLocalIPv6Address": "",
"LinkLocalIPv6PrefixLen": 0,
"Ports": {},
"SandboxKey": "/var/run/docker/netns/dac3fb08acc9",
"SecondaryIPAddresses": null,
"SecondaryIPv6Addresses": null,
"EndpointID": "",
"Gateway": "",
"GlobalIPv6Address": "",
"GlobalIPv6PrefixLen": 0,
"IPAddress": "",
"IPPrefixLen": 0,
"IPv6Gateway": "",
"MacAddress": "",
"Networks": {
"macvlan212": {
"IPAMConfig": {
"IPv4Address": "172.16.12.3"
},
"Links": null,
"Aliases": [
"dd453cfc4df8"
],
"NetworkID": "8c41c8c5cd0f8095f926ab15e326026568fc57bc60ae5e52cef0d752ff00073d",
"EndpointID": "7cf294048265a9c3a7926578086bd01f53371b4edf722d9aa9438a21626873eb",
"Gateway": "172.16.12.1",
"IPAddress": "172.16.12.3",
"IPPrefixLen": 24,
"IPv6Gateway": "",
"GlobalIPv6Address": "",
"GlobalIPv6PrefixLen": 0,
"MacAddress": "02:42:ac:10:0c:03"
}
}
}
}
]
For now we have reverted all Nomad agents back to version 0.6.3 and manually removed, via the Consul API, all the service registrations left behind by Nomad 0.7.1. It was an epic fail :-)
@tantra35 And just to be clear 172.16.12.3 was not advertised? The host's IP was?
@schmichael yes. For example, before the upgrade zabbixserver.service.consul resolved to 172.16.12.3; after the upgrade it began resolving to 172.16.9.2, the host where that container was placed.
Also, when we cleaned Consul of the garbage service registrations left by Nomad 0.7.1, we saw the following:
root@vol-h-docker-01:/home/ruslan# python ./test.py
{u'Node': u'vol-h-docker-01', u'Datacenter': u'test', u'CreateIndex': 45549647, u'ServiceName': u'zabbixserver', u'TaggedAddresses': {u'wan': u'172.16.9.2', u'lan': u'172.16.9.2'}, u'ModifyIndex': 45549679, u'ServicePort': 0, u'ServiceID': u'_nomad-task-esy5prk576ju2bw5atxoca5wzba6g3rb', u'ServiceAddress': u'', u'Address': u'172.16.9.2', u'ServiceTags': [], u'NodeMeta': {u'consul-network-segment': u''}, u'ServiceEnableTagOverride': False, u'ID': u'09cebb81-a1f3-48c6-dea2-7b250af4281d'}
{u'Node': u'vol-h-docker-02', u'Datacenter': u'test', u'CreateIndex': 45550302, u'ServiceName': u'zabbixserver', u'TaggedAddresses': {u'wan': u'172.16.9.199', u'lan': u'172.16.9.199'}, u'ModifyIndex': 45550351, u'ServicePort': 0, u'ServiceID': u'_nomad-executor-3a9be4df-5d33-de04-5d59-e44c4ec737c1-zabbixproxy-zabbixserver', u'ServiceAddress': u'172.16.12.3', u'Address': u'172.16.9.199', u'ServiceTags': [], u'NodeMeta': {u'consul-network-segment': u''}, u'ServiceEnableTagOverride': False, u'ID': u'00000000-0000-0000-0000-0cc47ab62b5a'}
The ServiceAddress field of the service registration created by Nomad 0.7.1 was empty. The other buggy service registrations had the same empty ServiceAddress field.
@tantra35 Ah! I see the issue. You're not specifying a port. Prior to 0.6 if a port wasn't specified we'd simply use the IP and set the port to 0.
In 0.7.1 rc1 I required ports to be set. This broke backward compatibility and #3673 was filed.
We decided unset ports should be allowed, and I attempted to revert to pre-0.7.1-rc1 behavior in PR #3674.
However, when a port isn't specified, I don't register the IP either! That still breaks backward compatibility.
While we're discussing the proper fix, you can work around this by specifying a non-zero port for the service. You don't even have to create a port label. The service can literally be:
service
{
name = "zabbixserver"
address_mode = "driver"
# set any non-zero port to get the IP to register
port = "1"
}
Hm, we had thought about the port, but panic forced us to revert to the previous version. And when we read the discussion in GH-3673, I thought that a missing port in the job description would not cause any issue, because of this:
However after hearing your use case (and discussing some other potential uses I'm sure others' are using), we've decided to allow services without ports. I'll get a PR up ASAP and it will be included in 0.7.1 final.
And thanks to the workaround we will try the upgrade again.
Just updated the PR again with some logging improvements and included #3680 in the binaries if you have time to test.
@schmichael Cool! I don't have enough English skill to fully express my admiration for your work. We will try the fix as soon as possible, within 1-2 days.
@schmichael we investigated this issue on our side, inspected all our jobs, and found one with this description (I omit some job details):
config
{
image = "playrix/elasticsearch:2.4.6-po01"
load = "playrix-elasticsearch-2.4.6-po01.tar.gz"
command = "/startup.sh"
network_mode = "macvlan212"
ipv4_address = "172.16.12.2"
dns_servers = ["172.16.9.99"]
volume_driver = "resmon"
volumes = [
"elasticsearch:/home/elasticsearch"
]
}
service
{
name = "elasticsearch"
address_mode = "driver"
port = "appport"
}
resources
{
memory = 4096
cpu = 1000
network
{
mbits = 10
port "appport"
{
static = "9200"
}
port "jmx"
{
static = "9288"
}
}
}
But in this case the elasticsearch service does not register at all. Only after we applied the workaround with a fake port was the service's functionality restored. As I understand it, this is not the intended behaviour, because the port was formally declared.
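For clarity, the fake-port workaround applied here looks roughly like the following sketch (replacing the "appport" label with a literal non-zero port; here we assume the same 9200 that the label points to):
service
{
name = "elasticsearch"
address_mode = "driver"
# workaround: a literal non-zero port instead of the "appport" label,
# so that the driver IP gets registered in Consul
port = "9200"
}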
We just tested your binary and something strange happens. For some time, some services disappear from the Consul DNS zone (this happens not for all services), and after about 5 minutes they appear again; there is no information about this in the Nomad client logs.
The service declaration for the affected service was absolutely legal:
service
{
name = "zabbixfrontend"
port = "appport"
check
{
name = "alive"
type = "tcp"
interval = "10s"
timeout = "2s"
initial_status = "critical"
}
}
On the server side we see the following in the logs:
Dec 21 14:29:50 consulnomad-01 nomad.sh[18322]: ==> Caught signal: interrupt
Dec 21 14:29:50 consulnomad-01 nomad.sh[18322]: 2017/12/21 14:29:50 [WARN] serf: Shutdown without a Leave
Dec 21 14:29:50 consulnomad-01 systemd[1]: Stopping Nomad agent...
Dec 21 14:29:50 consulnomad-01 nomad.sh[18322]: 2017/12/21 14:29:50 [ERR] raft-net: Failed to decode incoming command: transport shutdown
Dec 21 14:29:50 consulnomad-01 nomad.sh[18322]: 2017/12/21 14:29:50 [ERR] raft-net: Failed to decode incoming command: transport shutdown
Dec 21 14:29:50 consulnomad-01 nomad.sh[18322]: 2017/12/21 14:29:50.551586 [ERR] consul.sync: error deregistering agent service (id: "f5eacdbcaf552d5916571357b791871f154dafc7"): Unexpected response code: 500 (Unknown check "f5eacdbcaf552d5916571357b791871f154dafc7")
Dec 21 14:29:50 consulnomad-01 nomad.sh[18322]: 2017/12/21 14:29:50.552174 [ERR] consul.sync: error deregistering agent service (id: "4f3f89f79df795b9da5edee4a127c64bd9c556d2"): Unexpected response code: 500 (Unknown check "4f3f89f79df795b9da5edee4a127c64bd9c556d2")
Dec 21 14:29:50 consulnomad-01 nomad.sh[18322]: 2017/12/21 14:29:50.552721 [ERR] consul.sync: error deregistering agent service (id: "93b2c5fd25cbc9f404cca585bcfe9d56cac49675"): Unexpected response code: 500 (Unknown check "93b2c5fd25cbc9f404cca585bcfe9d56cac49675")
Dec 21 14:29:50 consulnomad-01 systemd[1]: nomad.service: Main process exited, code=exited, status=1/FAILURE
Dec 21 14:29:50 consulnomad-01 systemd[1]: Stopped Nomad agent.
Dec 21 14:29:50 consulnomad-01 systemd[1]: nomad.service: Unit entered failed state.
Dec 21 14:29:50 consulnomad-01 systemd[1]: nomad.service: Failed with result 'exit-code'.
Dec 21 14:29:50 consulnomad-01 systemd[1]: Started Nomad agent.
Dec 21 14:29:50 consulnomad-01 nomad.sh[20948]: Loaded configuration from /etc/nomad/nomad.json
Dec 21 14:29:50 consulnomad-01 nomad.sh[20948]: ==> Starting Nomad agent...
Dec 21 14:29:50 consulnomad-01 nomad.sh[20948]: ==> Nomad agent configuration:
Dec 21 14:29:50 consulnomad-01 nomad.sh[20948]: Client: false
Dec 21 14:29:50 consulnomad-01 nomad.sh[20948]: Log Level: WARN
Dec 21 14:29:50 consulnomad-01 nomad.sh[20948]: Region: atf01 (DC: test)
Dec 21 14:29:50 consulnomad-01 nomad.sh[20948]: Server: true
Dec 21 14:29:50 consulnomad-01 nomad.sh[20948]: Version: 0.8.0-dev
Dec 21 14:29:50 consulnomad-01 nomad.sh[20948]: ==> Nomad agent started! Log data will stream in below:
I don't think this is a problem with this custom binary; it seems to affect the whole Nomad 0.7.1 branch.
@schmichael In this job:
job "zabbixproxy"
{
...............
task "zabbixproxy"
{
driver = "docker"
kill_timeout = "1m"
..............
config
{
image = "playrix/zabbix-proxy-sqllite:3.4.4"
load = "playrix-zabbix-proxy-sqllite-3.4.4.tar.gz"
network_mode = "host"
command = "/sbin/init_plrx"
args = ["-c", "/usr/bin/runsvdir -P /etc/service/"]
}
........
service
{
name = "zabbixserver"
address_mode = "driver"
port = "appport"
}
........
resources
{
...............
network
{
mbits = 10
port "appport"
{
static = "10051"
}
}
}
}
}
When we specify the port in the service definition, we get the following error:
01/12/18 18:42:38 STD Driver Failure unable to get address for service "zabbixserver": invalid port "appport": strconv.Atoi: parsing "appport": invalid syntax
and the job goes to the dead state. Only when network_mode is the default, i.e. bridge, do symbolic port references still work.
That is an unfortunate error message that should be improved. You should not use service.address_mode = "driver" with network_mode = "host": the driver won't have any network information in host network mode, so you want to let the service use the host network, which should properly resolve the port label.
Sorry it's gotten so complicated. I will try to improve the error message to at least point you in the right direction!
I pushed an improvement to the error message in #3682 to hopefully help make debugging this easier.
If you remove address_mode=driver from the service it should work.
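In other words, for the host-network job above, a service stanza roughly like this sketch should be enough (address_mode removed so the host network is used and the "appport" label resolves):
service
{
name = "zabbixserver"
# no address_mode = "driver" here: with network_mode = "host" the service
# uses the host network, where the "appport" label can be resolved
port = "appport"
}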
This is still an issue for IPv6 to the point where it completely overrides any IPv6 network settings you have on Docker @schmichael