While attempting to debug a timeout issue in Envoy, I've discovered that Envoy is routing HTTP requests to services that are failing their active health checks.
We have two VMs. Both VMs provide a greeter service: a simple Flask app to greet users by their name. We refer to these two greeters as greeter 1 and greeter 2:
from flask import Flask
from flask import jsonify

app = Flask(__name__)

@app.route('/greet/<name>')
def greet(name):
    # "greeter n" reads "greeter 1" on VM 1 and "greeter 2" on VM 2
    return jsonify({'message': 'greeter n: {}'.format(name)})

@app.route('/healthz')
def healthz():
    # empty body with an implicit HTTP 200
    return ''

app.run()
On VM 1:
curl -s 127.0.0.1:5000/greet/george | jq .
{
  "message": "greeter 1: george"
}
and on VM 2:
curl -s 127.0.0.1:5000/greet/george | jq .
{
  "message": "greeter 2: george"
}
Both these VMs run Envoy 1.4.0:
/opt/envoy/1.4.0/bin/envoy version: a8507f67225cdd912712971bf72d41f219eb74ed/Modified/DEBUG
with the following configuration file:
{
  "admin": {
    "access_log_path": "/var/log/envoy/admin.access_log_path",
    "address": "tcp://127.0.0.1:8480"
  },
  "cluster_manager": {
    "cds": {
      "cluster": {
        "connect_timeout_ms": 250,
        "hosts": [
          {
            "url": "tcp://127.0.0.1:8400"
          }
        ],
        "lb_type": "round_robin",
        "name": "cds",
        "type": "static"
      },
      "refresh_delay_ms": 15000
    },
    "clusters": [
      {
        "connect_timeout_ms": 250,
        "hosts": [
          {
            "url": "tcp://127.0.0.1:8400"
          }
        ],
        "lb_type": "round_robin",
        "name": "rds",
        "type": "static"
      }
    ],
    "sds": {
      "cluster": {
        "connect_timeout_ms": 250,
        "hosts": [
          {
            "url": "tcp://127.0.0.1:8400"
          }
        ],
        "lb_type": "round_robin",
        "name": "sds",
        "type": "static"
      },
      "refresh_delay_ms": 15000
    }
  },
  "listeners": [
    {
      "address": "tcp://0.0.0.0:8180",
      "filters": [
        {
          "config": {
            "codec_type": "auto",
            "filters": [
              {
                "config": {},
                "name": "router",
                "type": "decoder"
              }
            ],
            "rds": {
              "cluster": "rds",
              "route_config_name": "egress"
            },
            "stat_prefix": "egress_http"
          },
          "name": "http_connection_manager",
          "type": "read"
        }
      ]
    },
    {
      "address": "tcp://0.0.0.0:8181",
      "filters": [
        {
          "config": {
            "codec_type": "auto",
            "filters": [
              {
                "config": {},
                "name": "router",
                "type": "decoder"
              }
            ],
            "rds": {
              "cluster": "rds",
              "route_config_name": "ingress"
            },
            "stat_prefix": "ingress_http"
          },
          "name": "http_connection_manager",
          "type": "read"
        }
      ]
    }
  ],
  "statsd_udp_ip_address": "127.0.0.1:8125"
}
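For reference, Envoy on each VM is started against this file along the following lines (the config path and the --service-cluster/--service-node values here are assumptions on my part; whatever is passed simply shows up as the <cluster> and <host> parameters in the v1 discovery URLs below, which the discovery app ignores):
/opt/envoy/1.4.0/bin/envoy -c /etc/envoy/envoy.json \
  --service-cluster greeter \
  --service-node vm-1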
The cluster (CDS), route (RDS) and service discovery (SDS) services are served from another Flask app in which all definitions are static (to make debugging easier):
import os

from flask import Flask
from flask import jsonify

app = Flask(__name__)

_clusters = [{
    'name': 'greeter-egress',
    'type': 'sds',
    'lb_type': 'round_robin',
    'service_name': 'greeter-egress',
    'connect_timeout_ms': 250,
    'health_check': {
        'type': 'http',
        'timeout_ms': 15000,
        'interval_ms': 10000,
        'unhealthy_threshold': 3,
        'healthy_threshold': 3,
        'path': '/healthz',
        'service_name': 'greeter-egress'
    }
}, {
    'name': 'greeter-ingress',
    'type': 'sds',
    'lb_type': 'round_robin',
    'service_name': 'greeter-ingress',
    'connect_timeout_ms': 250,
    'health_check': {
        'type': 'http',
        'timeout_ms': 15000,
        'interval_ms': 10000,
        'unhealthy_threshold': 3,
        'healthy_threshold': 3,
        'path': '/healthz',
        'service_name': 'greeter-ingress'
    }
}]

_services = {
    'greeter-egress': {
        'name': 'greeter',
        'hosts': [{
            'name': '',
            'ip_address': os.getenv('REMOTE_GREETER_HOST'),
            'port': 8181,
            'tags': {}
        }, {
            'name': '',
            'ip_address': '127.0.0.1',
            'port': 8181,
            'tags': {}
        }]
    },
    'greeter-ingress': {
        'name': 'greeter',
        'hosts': [{
            'name': '',
            'ip_address': '127.0.0.1',
            'port': 5000,
            'tags': {}
        }]
    },
}

_routes = {
    'egress': [{
        'name': 'greeter',
        'domains': ['*'],
        'routes': [{
            'prefix': '/',
            'cluster': 'greeter-egress'
        }]
    }],
    'ingress': [{
        'name': 'greeter',
        'domains': ['*'],
        'routes': [{
            'prefix': '/',
            'cluster': 'greeter-ingress'
        }]
    }]
}

# CDS: return the same static cluster definitions to every caller.
@app.route('/v1/clusters/<cluster>/<host>')
def clusters(cluster, host):
    return jsonify(clusters=_clusters)

# SDS: resolve a service name to its list of hosts.
@app.route('/v1/registration/<name>')
def service(name):
    return jsonify(_services[name])

# RDS: the egress and ingress route tables.
@app.route('/v1/routes/egress/<cluster>/<host>')
def egress_routes(cluster, host):
    return jsonify(virtual_hosts=_routes['egress'])

@app.route('/v1/routes/ingress/<cluster>/<host>')
def ingress_routes(cluster, host):
    return jsonify(virtual_hosts=_routes['ingress'])

app.run(port=8400)
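The discovery endpoints can be exercised directly. For example, querying the SDS registration endpoint on VM 1, where 10.0.0.2 stands in for whatever REMOTE_GREETER_HOST is set to (purely illustrative), I'd expect:
curl -s 127.0.0.1:8400/v1/registration/greeter-egress | jq .
{
  "hosts": [
    {
      "ip_address": "10.0.0.2",
      "name": "",
      "port": 8181,
      "tags": {}
    },
    {
      "ip_address": "127.0.0.1",
      "name": "",
      "port": 8181,
      "tags": {}
    }
  ],
  "name": "greeter"
}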
When a client makes an HTTP request to the greeter service, it sends the request to the local egress listener on 127.0.0.1:8180. The egress listener proxies the request either to the local ingress listener on 127.0.0.1:8181 or to the ingress listener on the other VM, round-robin. The ingress listener then forwards the proxied request to the greeter service on 127.0.0.1:5000.
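Each hop can be tested in isolation. Hitting the ingress listener directly should always return the local greeter, since the ingress route only points at 127.0.0.1:5000; on VM 1, for instance:
curl -s 127.0.0.1:8181/greet/george | jq .
{
  "message": "greeter 1: george"
}
Through the egress listener, by contrast, requests alternate between the two VMs: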
curl -s 127.0.0.1:8180/greet/george | jq .
{
  "message": "greeter 1: george"
}
curl -s 127.0.0.1:8180/greet/george | jq .
{
  "message": "greeter 2: george"
}
If I change /healthz on greeter 1 to return HTTP 503, the active health checks in Envoy detect that greeter 1 is down:
@app.route('/healthz')
def healthz():
    return 'service unavailable', 503
as shown in the log output on both VMs:
[2017-10-16 13:01:19.927][18507][debug][hc] source/common/upstream/health_checker_impl.cc:304] [C106] hc response=503 health_flags=/failed_active_hc
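The admin interface gives another view of this. Querying /clusters on the admin port (8480 in the config above) shows per-host health flags; the output below is a sketch of what I'd expect on VM 1, with 10.0.0.2 again standing in for REMOTE_GREETER_HOST, and the exact format may vary by Envoy version:
curl -s 127.0.0.1:8480/clusters | grep health_flags
greeter-egress::10.0.0.2:8181::health_flags::healthy
greeter-egress::127.0.0.1:8181::health_flags::/failed_active_hc
greeter-ingress::127.0.0.1:5000::health_flags::/failed_active_hc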
However, after three consecutive failures (the configured unhealthy_threshold), HTTP requests do not fail over to greeter 2; they continue to be load balanced across both greeters as normal:
curl -s 127.0.0.1:8180/greet/george | jq .
{
  "message": "greeter 2: george"
}
curl -s 127.0.0.1:8180/greet/george | jq .
{
  "message": "greeter 1: george"
}
curl -s 127.0.0.1:8180/greet/george | jq .
{
  "message": "greeter 2: george"
}
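The cluster membership gauges tell the same story: Envoy knows only one of the two greeter-egress hosts is healthy, yet requests keep reaching both. Roughly what I'd expect from the admin stats endpoint (stat names follow Envoy's cluster.<name>.* convention; values illustrative):
curl -s 127.0.0.1:8480/stats | grep greeter-egress.membership
cluster.greeter-egress.membership_healthy: 1
cluster.greeter-egress.membership_total: 2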
In the case where greeter 1 crashes outright and its HTTP server stops listening, we get the following error instead:
curl -s 127.0.0.1:8180/greet/george
{
  "message": "greeter 2: george"
}
curl -s 127.0.0.1:8180/greet/george
upstream connect error or disconnect/reset before headers
curl -s 127.0.0.1:8180/greet/george
{
  "message": "greeter 2: george"
}
Closing as answered. Please reopen if there are further questions.