I have CouchDB 2.0.0-2 running on Ubuntu 14.04.3 using this AWS image https://docs.bitnami.com/aws/infrastructure/couchdb/.
My CouchDB instance is pretty small. It has about 1300 databases with an average of 300KB per database and 2 databases with less than 100MB of data each.
I have a NodeJS process that listens to the _changes feed of all databases except the two big ones and when there is a change does some processing and saves a document in one of the big databases.
It was working pretty well for some time but now when I start the NodeJS process CouchDB stops responding and I see a lot of errors like this in the logs:
[error] 2017-09-06T12:54:23.113497Z couchdb@localhost <0.2166.3> 0ae11b116b rexi_server exit:{timeout,{gen_server,call,[couch_server,{open,<<"shards/00000000-1fffffff/ruum_1491318340205uh4nvaer9gywpo5g3w99.1496998111">>,[{timeout,100},{user_ctx,{user_ctx,<<"admin">>,[<<"_admin">>],<<"cookie">>}}]},100]}} [{gen_server,call,3,[{file,"gen_server.erl"},{line,212}]},{couch_server,open,2,[{file,"src/couch_server.erl"},{line,86}]},{couch_db,open,2,[{file,"src/couch_db.erl"},{line,91}]},{fabric_rpc,open_shard,2,[{file,"src/fabric_rpc.erl"},{line,248}]},{rexi_server,init_p,3,[{file,"src/rexi_server.erl"},{line,139}]}]
[error] 2017-09-06T12:54:23.113783Z couchdb@localhost <0.2165.3> 83178616c2 rexi_server exit:{timeout,{gen_server,call,[couch_server,{open,<<"shards/00000000-1fffffff/ruum_1496396724194h2xoq4yjjoj1bb0lblyu.1497006992">>,[{timeout,100},{user_ctx,{user_ctx,<<"admin">>,[<<"_admin">>],<<"cookie">>}}]},100]}} [{gen_server,call,3,[{file,"gen_server.erl"},{line,212}]},{couch_server,open,2,[{file,"src/couch_server.erl"},{line,86}]},{couch_db,open,2,[{file,"src/couch_db.erl"},{line,91}]},{fabric_rpc,open_shard,2,[{file,"src/fabric_rpc.erl"},{line,248}]},{rexi_server,init_p,3,[{file,"src/rexi_server.erl"},{line,139}]}]
[error] 2017-09-06T12:54:23.114188Z couchdb@localhost <0.2198.3> 1d0e99faff rexi_server exit:{timeout,{gen_server,call,[couch_server,{open,<<"shards/00000000-1fffffff/ruum_1490627286973_kdy7bhi5zbzzk6aa2yk5.1496997982">>,[{timeout,100},{user_ctx,{user_ctx,<<"admin">>,[<<"_admin">>],<<"cookie">>}}]},100]}} [{gen_server,call,3,[{file,"gen_server.erl"},{line,212}]},{couch_server,open,2,[{file,"src/couch_server.erl"},{line,86}]},{couch_db,open,2,[{file,"src/couch_db.erl"},{line,91}]},{fabric_rpc,open_shard,2,[{file,"src/fabric_rpc.erl"},{line,248}]},{rexi_server,init_p,3,[{file,"src/rexi_server.erl"},{line,139}]}]
I should be able to listen to the _changes of all my databases without it causing CouchDB to stop being responsive.
When I start the NodeJS process CouchDB stops responding.
I think this is not a bug but most likely something I am doing wrong in the configurations.
Try increasing max_dbs_open and see if it helps
[couchdb]
max_dbs_open = 2500
As you increase it, keep an eye on memory usage
Hey @nickva, I had already increased it to 5000 but the problem is still there. Memory is not an issue either; I'm running on an EC2 t.2xlarge instance.
We just ran into the same issue again. It is weird: it runs for some time and then I start getting those timeout errors. CPU and memory utilization are below 50%.

This is my configuration:
{
"native_query_servers": {
"query": "{mango_native_proc, start_link, []}"
},
"httpd_design_handlers": {
"_compact": "{couch_mrview_http, handle_compact_req}",
"_info": "{couch_mrview_http, handle_info_req}",
"_list": "{couch_mrview_show, handle_view_list_req}",
"_rewrite": "{couch_httpd_rewrite, handle_rewrite_req}",
"_show": "{couch_mrview_show, handle_doc_show_req}",
"_update": "{couch_mrview_show, handle_doc_update_req}",
"_view": "{couch_mrview_http, handle_view_req}",
"_view_changes": "{couch_mrview_http, handle_view_changes_req}"
},
"uuids": {
"algorithm": "sequential",
"max_count": "1000"
},
"cluster": {
"n": "3",
"q": "8"
},
"cors": {
"credentials": "true",
"headers": "accept, authorization, content-type, origin, referer",
"methods": "GET, PUT, POST, HEAD, DELETE",
"origins": "*"
},
"chttpd": {
"backlog": "512",
"bind_address": "0.0.0.0",
"docroot": "./share/www",
"port": "5984",
"socket_options": "[{recbuf, 262144}, {sndbuf, 262144}, {nodelay, true}]"
},
"httpd_global_handlers": {
"/": "{couch_httpd_misc_handlers, handle_welcome_req, <<\"Welcome\">>}",
"_active_tasks": "{couch_httpd_misc_handlers, handle_task_status_req}",
"_all_dbs": "{couch_httpd_misc_handlers, handle_all_dbs_req}",
"_config": "{couch_httpd_misc_handlers, handle_config_req}",
"_oauth": "{couch_httpd_oauth, handle_oauth_req}",
"_plugins": "{couch_plugins_httpd, handle_req}",
"_replicate": "{couch_replicator_httpd, handle_req}",
"_restart": "{couch_httpd_misc_handlers, handle_restart_req}",
"_session": "{couch_httpd_auth, handle_session_req}",
"_stats": "{couch_stats_httpd, handle_stats_req}",
"_system": "{chttpd_misc, handle_system_req}",
"_utils": "{couch_httpd_misc_handlers, handle_utils_dir_req, \"./share/www\"}",
"_uuids": "{couch_httpd_misc_handlers, handle_uuids_req}",
"favicon.ico": "{couch_httpd_misc_handlers, handle_favicon_req, \"/opt/bitnami/couchdb/share/www\"}"
},
"attachments": {
"compressible_types": "text/*, application/javascript, application/json, application/xml",
"compression_level": "8"
},
"query_server_config": {
"os_process_limit": "10",
"reduce_limit": "true"
},
"vendor": {
"name": "The Apache Software Foundation"
},
"replicator": {
"connection_timeout": "30000",
"http_connections": "20",
"max_replication_retry_count": "10",
"retries_per_request": "10",
"socket_options": "[{keepalive, true}, {nodelay, false}]",
"ssl_certificate_max_depth": "3",
"start_delay": "0",
"start_splay": "0",
"verify_ssl_certificates": "false",
"worker_batch_size": "500",
"worker_processes": "4"
},
"couch_httpd_oauth": {
"use_users_db": "false"
},
"ssl": {
"port": "6984"
},
"log": {
"level": "info",
"writer": "stderr"
},
"indexers": {
"couch_mrview": "true"
},
"view_compaction": {
"keyvalue_buffer_size": "2097152"
},
"query_servers": {
"coffeescript": "/opt/bitnami/couchdb/bin/couchjs /opt/bitnami/couchdb/share/server/main-coffee.js",
"javascript": "/opt/bitnami/couchdb/bin/couchjs /opt/bitnami/couchdb/share/server/main.js"
},
"daemons": {
"auth_cache": "{couch_auth_cache, start_link, []}",
"compaction_daemon": "{couch_compaction_daemon, start_link, []}",
"couch_peruser": "{couch_peruser, start_link, []}",
"external_manager": "{couch_external_manager, start_link, []}",
"httpd": "{couch_httpd, start_link, []}",
"index_server": "{couch_index_server, start_link, []}",
"os_daemons": "{couch_os_daemons, start_link, []}",
"query_servers": "{couch_proc_manager, start_link, []}",
"uuids": "{couch_uuids, start, []}",
"vhosts": "{couch_httpd_vhost, start_link, []}"
},
"couch_peruser": {
"delete_dbs": "false",
"enable": "false"
},
"httpd": {
"WWW-Authenticate": "Basic realm=\"Administrator\"",
"allow_jsonp": "false",
"authentication_handlers": "{couch_httpd_oauth, oauth_authentication_handler}, {couch_httpd_auth, cookie_authentication_handler}, {couch_httpd_auth, default_authentication_handler}",
"bind_address": "127.0.0.1",
"default_handler": "{couch_httpd_db, handle_request}",
"enable_cors": "true",
"port": "5986",
"secure_rewrites": "true",
"socket_options": "[{recbuf, 262144}, {sndbuf, 262144}]",
"vhost_global_handlers": "_utils, _uuids, _session, _oauth, _users"
},
"httpd_db_handlers": {
"_all_docs": "{couch_mrview_http, handle_all_docs_req}",
"_changes": "{couch_httpd_db, handle_db_changes_req}",
"_compact": "{couch_httpd_db, handle_compact_req}",
"_design": "{couch_httpd_db, handle_design_req}",
"_design_docs": "{couch_mrview_http, handle_design_docs_req}",
"_local_docs": "{couch_mrview_http, handle_local_docs_req}",
"_temp_view": "{couch_mrview_http, handle_temp_view_req}",
"_view_cleanup": "{couch_mrview_http, handle_cleanup_req}"
},
"database_compaction": {
"checkpoint_after": "5242880",
"doc_buffer_size": "524288"
},
"csp": {
"enable": "true"
},
"couch_httpd_auth": {
"allow_persistent_cookies": "true",
"auth_cache_size": "50",
"authentication_db": "_users",
"authentication_redirect": "/_utils/session.html",
"iterations": "10",
"require_valid_user": "true",
"timeout": "604800"
},
"couchdb": {
"attachment_stream_buffer_size": "4096",
"database_dir": "/opt/bitnami/couchdb/var/lib/couchdb",
"default_security": "admin_local",
"delayed_commits": "false",
"file_compression": "snappy",
"max_dbs_open": "5000",
"max_document_size": "67108864",
"os_process_timeout": "25000",
"plugin_dir": "/opt/bitnami/couchdb/lib/couchdb/plugins",
"uuid": "88f2a33c450a742c62ec55ab26bbb6ec",
"view_index_dir": "/opt/bitnami/couchdb/var/lib/couchdb"
},
"compaction_daemon": {
"check_interval": "300",
"min_file_size": "131072"
}
}
This error might be more helpful in order to find the problem:
It says: No DB shards could be opened.
What does that mean ?
[error] 2017-09-06T17:58:34.380145Z couchdb@localhost <0.14476.8> 8c3f780d86 rexi_server exit:{timeout,{gen_server,call,[couch_server,{open,<<"shards/40000000-5fffffff/ruum_1500369224024ttg2zer6jqcuiwa098li.1500369260">>,[{timeout,400},{user_ctx,{user_ctx,<<"admin">>,[<<"_admin">>],<<"default">>}}]},400]}} [{gen_server,call,3,[{file,"gen_server.erl"},{line,212}]},{couch_server,open,2,[{file,"src/couch_server.erl"},{line,86}]},{couch_db,open,2,[{file,"src/couch_db.erl"},{line,91}]},{fabric_rpc,open_shard,2,[{file,"src/fabric_rpc.erl"},{line,248}]},{rexi_server,init_p,3,[{file,"src/rexi_server.erl"},{line,139}]}]
[error] 2017-09-06T17:58:34.380466Z couchdb@localhost <0.20913.7> bc8c6439c1 req_err(801399354) internal_server_error : No DB shards could be opened.
[<<"fabric_util:get_shard/4 L180">>,<<"fabric_util:get_shard/4 L195">>,<<"fabric_util:get_shard/4 L195">>,<<"fabric:get_security/2 L146">>,<<"chttpd_auth_request:db_authorization_check/1 L87">>,<<"chttpd_auth_request:authorize_request/1 L19">>,<<"chttpd:process_request/1 L291">>,<<"chttpd:handle_request_int/1 L229">>]
Once it happens the only thing that temporarily fixes it is to restart the database.
Still sounds like you're running into a max fds issue.
I know nothing about that bitnami image, but do you know how it starts CouchDB? If it's via systemd, systemd has its own overrides on fd limits that need modifying. There are other places you need to change fd limits as well.
Some resources:
This is the limits for the process running the couchdb

bitnami@ip-172-31-3-49:~$ sudo cat /proc/27037/limits
Limit Soft Limit Hard Limit Units
Max cpu time unlimited unlimited seconds
Max file size unlimited unlimited bytes
Max data size unlimited unlimited bytes
Max stack size 8388608 unlimited bytes
Max core file size 0 unlimited bytes
Max resident set unlimited unlimited bytes
Max processes 32768 32768 processes
Max open files 100000 100000 files
Max locked memory 65536 65536 bytes
Max address space unlimited unlimited bytes
Max file locks unlimited unlimited locks
Max pending signals 257326 257326 signals
Max msgqueue size 819200 819200 bytes
Max nice priority 0 0
Max realtime priority 0 0
Max realtime timeout unlimited unlimited us
The limit is 100k which I would assume is enough ?
also
bitnami@ip-172-31-3-49:~$ cat /proc/sys/fs/file-max
100000
bitnami@ip-172-31-3-49:~$ ulimit -Sn
100000
bitnami@ip-172-31-3-49:~$ ulimit -Hn
100000
Just checking if anybody else knows what else could be the problem ? Also, is it correct to expect that CouchDB should be able to handle thousands of clients listening to the _changes feed ?
I'm afraid I don't have any free time to further look at your issue. There are companies offering commercial CouchDB support if that suits your needs.
Yes, CouchDB can handle "thousands of clients" listening to a _changes feed, though you'd be better served with a 3-node cluster behind a load balancer rather than a single machine handling all that load.
yes, I would be interested in recommendations of companies offering professional support.
Can you run the Node process on a different machine? My guess is disk performance in your VM is the big issue.
Someone from https://neighbourhood.ie should have reached out to you already; if not, you can contact them directly.
@wohali , the node process is already running on a different machine. I have contacted neighbourhood.ie.
@FelipeTaiarol please share your findings, because we have a similar problem on a CouchDB instance with practically no load which has 100,000+ tiny databases. We have also raised the max open files limits.
Looking again at the error reported:
[error] 2017-09-06T12:54:23.113497Z couchdb@localhost <0.2166.3> 0ae11b116b rexi_server exit:{timeout,{gen_server,call,[couch_server,{open,<<"shards/00000000-1fffffff/ruum_1491318340205uh4nvaer9gywpo5g3w99.1496998111">>,[{timeout,100},{user_ctx,{user_ctx,<<"admin">>,[<<"_admin">>],<<"cookie">>}}]},100]}} [{gen_server,call,3,[{file,"gen_server.erl"},{line,212}]},{couch_server,open,2,[{file,"src/couch_server.erl"},{line,86}]},{couch_db,open,2,[{file,"src/couch_db.erl"},{line,91}]},{fabric_rpc,open_shard,2,[{file,"src/fabric_rpc.erl"},{line,248}]},{rexi_server,init_p,3,[{file,"src/rexi_server.erl"},{line,139}]}]
What we're seeing here is a 100ms timeout on couch_server opening a shard on disk. The likely culprits here are slow EBS store or maxed-out CPU. If you can rule out the second, you are almost certainly being hit by the first. AWS offers higher tiers of EBS that you will want to investigate, and specifically "the highest performance Provisioned IOPS SSD (io1) for latency-sensitive transactional workloads". You will want to ensure at the very least that your data directory (/opt/bitnami/couchdb/var/lib/couchdb per above) is on such a volume. You can also improve throughput by not logging to the local node, and instead using syslog to stream the logs to a separate logging server.
CouchDB is a high-performance product expecting to run on best available hardware. If you're running it in the cloud, you need to be aware of "gotchas" like this.
@FelipeTaiarol it's worth noting that the [cluster] block controls the number of shard files that are created per database. The default value in your configuration is 8, so that means your 1300 databases actually correspond to 10400 unique shard files. The max_dbs_open value actually controls the number of concurrently open shard files, so it may be that you're still asking the process that manages the list of open databases to do a lot of work to keep at most 5000 of them open.
If your database count is going to hold steady at 1300 I would look at increasing the limit again. If you are going to keep creating new databases you probably want to be using a smaller value for [cluster] q. The small databases probably don't need to be sharded and so you can just set that to 1. It is possible to override the default when creating a database using the query string, e.g. PUT /dbname?q=1.
Thanks @kocolosk, I increased max_dbs_open to 15k and set [cluster] q=2 in my local.ini.
@wohali, it might be that having a faster hard drive would fix it, but I find it hard to believe CouchDB needs it with the kind of load we are dealing with. The max open files explanation makes more sense, I think.
@FelipeTaiarol Did the max_dbs_open resolve this for you?
I am still suspicious of disk latency being the issue here.
@FelipeTaiarol Please respond if this issue is still a problem with max_dbs_open increased to 15k and using q=2.
If and when CouchDB ends up in an unresponsive state, please check the output of localhost:5986/_system and look at the message queue sizes. Is the couch_server message queue high?
What those fabric timeout errors mean: Couch is trying to open a shard quickly to grab the security properties, and that's why the timeout is set to 100ms. If the request fails, it tries a different shard on a different node, and doubles the timeout. It will keep trying, doubling the timeout each time, until it runs out of shards, at which point you will get the No DB shards could be opened error.
@FelipeTaiarol and any future Google search people, another option to set is the update_lru_on_read option. This significantly decreases the message volume that couch_server has to deal with.
Just add this to your config (more specifically, just add that option to the [couchdb] section):
[couchdb]
update_lru_on_read = false
We stopped listening to all databases' _changes feeds and switched to listening only to /_db_updates. At the same time we increased max_dbs_open to 15k and set [cluster] q=2, so I am not sure which of the two solved the issue.
@FelipeTaiarol OK, thanks. FYI for /_db_updates be aware of this recently reported issue: https://github.com/apache/couchdb/issues/1063
*Solved - scroll down to read solution
Hi,
I have the same issue with 60 dbs, running on a local computer. Each db has 4 views and up to 2 million docs. Only the last db is being updated.
I am getting emfile error, and shard db error, and errors are changing according to the query range.
Sometimes only one db returns error, sometimes multiple.
https://gist.github.com/adyshimony/d5b9fec95a7ec7544095cef398960c5c
I set up the config exactly as written here
http://docs.couchdb.org/en/stable/maintenance/performance.html#system-resource-limits
Checking ulimit -n returns 64000
Also set max_dbs_open is 15k
Also added
[couchdb]
update_lru_on_read = false
But when checking for service limits:
├─beam.smp(1532)─┬─couchjs(3546)───{couchjs}(3554)
│ ├─couchjs(3547)───{couchjs}(3555)
│ ├─couchjs(3548)───{couchjs}(3553)
cat /proc/3546/limits
I am getting Max open files 1024
For every process in the list, even though I configured this:
/etc/systemd/system/couchdb.d/override.conf
[Service]
LimitNOFILE=64000.
Solution found. I am not deleting this because maybe it will help someone else.
Add
/etc/systemd/system/couchdb.service.d/limits.conf
[Service]
LimitNOFILE=64000
Fixed the issue with all the other things I mention above.
/etc/systemd/system/couchdb.d/override.conf is not enough, if it works at all.
Most helpful comment
*Solved - scroll down to read solution
Hi,
I have the same issue with 60 dbs, running on a local computer. Each db has 4 views and up to 2 million docs. Only the last db is being updated.
I am getting emfile error, and shard db error, and errors are changing according to the query range.
Sometimes only one db returns error, sometimes multiple.
https://gist.github.com/adyshimony/d5b9fec95a7ec7544095cef398960c5c
I set up the config exactly as written here
http://docs.couchdb.org/en/stable/maintenance/performance.html#system-resource-limits
Checking ulimit -n returns 64000
Also set max_dbs_open is 15k
Also added
[couchdb]
update_lru_on_read = false
But when checking for service limits:
├─beam.smp(1532)─┬─couchjs(3546)───{couchjs}(3554)
│ ├─couchjs(3547)───{couchjs}(3555)
│ ├─couchjs(3548)───{couchjs}(3553)
cat /proc/3546/limits
I am getting Max open files 1024
For every process in the list, even though I configured this:
/etc/systemd/system/couchdb.d/override.conf
[Service]
LimitNOFILE=64000.
Solution found. I am not deleting this because maybe it will help someone else.
Add
/etc/systemd/system/couchdb.service.d/limits.conf
[Service]
LimitNOFILE=64000
Fixed the issue with all the other things I mention above.
/etc/systemd/system/couchdb.d/override.conf is not enough, if it works at all.