I'm having an issue that, from what I can tell, is related to ZFS; I could be wrong though.
I have a directory that I'm unable to run ls on, or rather, ls never returns.
When I run ls on the directory under strace I get the following; I'm not sure if it's of any help.
john@Archon # strace ls /var/cache/pacman/pkg/
execve("/usr/sbin/ls", ["ls", "/var/cache/pacman/pkg/"], [/* 16 vars */]) = 0
brk(NULL) = 0x656000
access("/etc/ld.so.preload", R_OK) = -1 ENOENT (No such file or directory)
open("/etc/ld.so.cache", O_RDONLY|O_CLOEXEC) = 3
fstat(3, {st_mode=S_IFREG|0644, st_size=191377, ...}) = 0
mmap(NULL, 191377, PROT_READ, MAP_PRIVATE, 3, 0) = 0x7f52fb53c000
close(3) = 0
open("/usr/lib/libcap.so.2", O_RDONLY|O_CLOEXEC) = 3
read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0`\25\0\0\0\0\0\0"..., 832) = 832
fstat(3, {st_mode=S_IFREG|0644, st_size=17256, ...}) = 0
mmap(NULL, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f52fb53a000
mmap(NULL, 2112504, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x7f52fb145000
mprotect(0x7f52fb149000, 2093056, PROT_NONE) = 0
mmap(0x7f52fb348000, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x3000) = 0x7f52fb348000
close(3) = 0
open("/usr/lib/libc.so.6", O_RDONLY|O_CLOEXEC) = 3
read(3, "\177ELF\2\1\1\3\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\260\3\2\0\0\0\0\0"..., 832) = 832
fstat(3, {st_mode=S_IFREG|0755, st_size=1951744, ...}) = 0
mmap(NULL, 3791152, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x7f52fada7000
mprotect(0x7f52faf3c000, 2093056, PROT_NONE) = 0
mmap(0x7f52fb13b000, 24576, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x194000) = 0x7f52fb13b000
mmap(0x7f52fb141000, 14640, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x7f52fb141000
close(3) = 0
mmap(NULL, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f52fb538000
arch_prctl(ARCH_SET_FS, 0x7f52fb538700) = 0
mprotect(0x7f52fb13b000, 16384, PROT_READ) = 0
mprotect(0x61d000, 4096, PROT_READ) = 0
mprotect(0x7f52fb56b000, 4096, PROT_READ) = 0
munmap(0x7f52fb53c000, 191377) = 0
brk(NULL) = 0x656000
brk(0x677000) = 0x677000
open("/usr/lib/locale/locale-archive", O_RDONLY|O_CLOEXEC) = 3
fstat(3, {st_mode=S_IFREG|0644, st_size=1669168, ...}) = 0
mmap(NULL, 1669168, PROT_READ, MAP_PRIVATE, 3, 0) = 0x7f52fb3a0000
close(3) = 0
ioctl(1, TCGETS, {B38400 opost isig icanon echo ...}) = 0
ioctl(1, TIOCGWINSZ, {ws_row=65, ws_col=99, ws_xpixel=0, ws_ypixel=0}) = 0
stat("/var/cache/pacman/pkg/", {st_mode=S_IFDIR|0755, st_size=2461, ...}) = 0
open("/var/cache/pacman/pkg/", O_RDONLY|O_NONBLOCK|O_DIRECTORY|O_CLOEXEC) = 3
fstat(3, {st_mode=S_IFDIR|0755, st_size=2461, ...}) = 0
getdents(3, /* 553 entries */, 32768) = 32712
brk(0x6a2000) = 0x6a2000
brk(0x698000) = 0x698000
getdents(3, /* 555 entries */, 32768) = 32760
mmap(NULL, 311296, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f52fad5b000
getdents(3, /* 551 entries */, 32768) = 32760
mremap(0x7f52fad5b000, 311296, 618496, MREMAP_MAYMOVE) = 0x7f52facc4000
getdents(3,
I also had a seemingly related problem a few months ago that I "resolved" by renaming a directory and forgetting about it. The problem that time was that I was unable to delete a directory that seemed to be corrupted; rm -rf would just never return.
strace rm -rf /home/john/.cache/google-chrome/Default/Cache
execve("/usr/sbin/rm", ["rm", "-rf", "/home/john/.cache/google-chrome/"...], [/* 34 vars */]) = 0
brk(NULL) = 0x259a000
access("/etc/ld.so.preload", R_OK) = -1 ENOENT (No such file or directory)
open("/etc/ld.so.cache", O_RDONLY|O_CLOEXEC) = 3
fstat(3, {st_mode=S_IFREG|0644, st_size=177065, ...}) = 0
mmap(NULL, 177065, PROT_READ, MAP_PRIVATE, 3, 0) = 0x7fc18f0a4000
close(3) = 0
open("/usr/lib/libc.so.6", O_RDONLY|O_CLOEXEC) = 3
read(3, "\177ELF\2\1\1\3\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\260\3\2\0\0\0\0\0"..., 832) = 832
fstat(3, {st_mode=S_IFREG|0755, st_size=1951744, ...}) = 0
mmap(NULL, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7fc18f0a2000
mmap(NULL, 3791152, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x7fc18eb10000
mprotect(0x7fc18eca5000, 2093056, PROT_NONE) = 0
mmap(0x7fc18eea4000, 24576, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x194000) = 0x7fc18eea4000
mmap(0x7fc18eeaa000, 14640, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x7fc18eeaa000
close(3) = 0
arch_prctl(ARCH_SET_FS, 0x7fc18f0a3480) = 0
mprotect(0x7fc18eea4000, 16384, PROT_READ) = 0
mprotect(0x60d000, 4096, PROT_READ) = 0
mprotect(0x7fc18f0d0000, 4096, PROT_READ) = 0
munmap(0x7fc18f0a4000, 177065) = 0
brk(NULL) = 0x259a000
brk(0x25bb000) = 0x25bb000
open("/usr/lib/locale/locale-archive", O_RDONLY|O_CLOEXEC) = 3
fstat(3, {st_mode=S_IFREG|0644, st_size=1669168, ...}) = 0
mmap(NULL, 1669168, PROT_READ, MAP_PRIVATE, 3, 0) = 0x7fc18ef0a000
close(3) = 0
ioctl(0, TCGETS, {B38400 opost isig icanon echo ...}) = 0
lstat("/", {st_mode=S_IFDIR|0755, st_size=20, ...}) = 0
newfstatat(AT_FDCWD, "/home/john/.cache/google-chrome/Default/Cache", {st_mode=S_IFDIR|0700, st_size=9600, ...}, AT_SYMLINK_NOFOLLOW) = 0
openat(AT_FDCWD, "/home/john/.cache/google-chrome/Default/Cache", O_RDONLY|O_NOCTTY|O_NONBLOCK|O_DIRECTORY|O_NOFOLLOW) = 3
fstat(3, {st_mode=S_IFDIR|0700, st_size=9600, ...}) = 0
fcntl(3, F_GETFL) = 0x38800 (flags O_RDONLY|O_NONBLOCK|O_LARGEFILE|O_DIRECTORY|O_NOFOLLOW)
fcntl(3, F_SETFD, FD_CLOEXEC) = 0
getdents(3,
I was able to do the same with this directory by renaming it, but again I was unable to delete it. When I attempt to, the command just hangs.
Running strace on it I get:
john@Archon # strace rm -rf /var/cache/pacman/.broken-pkg-dont-touch
execve("/usr/sbin/rm", ["rm", "-rf", "/var/cache/pacman/.broken-pkg-do"...], [/* 17 vars */]) = 0
brk(NULL) = 0x2576000
access("/etc/ld.so.preload", R_OK) = -1 ENOENT (No such file or directory)
open("/etc/ld.so.cache", O_RDONLY|O_CLOEXEC) = 3
fstat(3, {st_mode=S_IFREG|0644, st_size=191949, ...}) = 0
mmap(NULL, 191949, PROT_READ, MAP_PRIVATE, 3, 0) = 0x7f2baafa5000
close(3) = 0
open("/usr/lib/libc.so.6", O_RDONLY|O_CLOEXEC) = 3
read(3, "\177ELF\2\1\1\3\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\260\3\2\0\0\0\0\0"..., 832) = 832
fstat(3, {st_mode=S_IFREG|0755, st_size=1951744, ...}) = 0
mmap(NULL, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f2baafa3000
mmap(NULL, 3791152, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x7f2baaa14000
mprotect(0x7f2baaba9000, 2093056, PROT_NONE) = 0
mmap(0x7f2baada8000, 24576, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x194000) = 0x7f2baada8000
mmap(0x7f2baadae000, 14640, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x7f2baadae000
close(3) = 0
arch_prctl(ARCH_SET_FS, 0x7f2baafa4480) = 0
mprotect(0x7f2baada8000, 16384, PROT_READ) = 0
mprotect(0x60d000, 4096, PROT_READ) = 0
mprotect(0x7f2baafd4000, 4096, PROT_READ) = 0
munmap(0x7f2baafa5000, 191949) = 0
brk(NULL) = 0x2576000
brk(0x2597000) = 0x2597000
open("/usr/lib/locale/locale-archive", O_RDONLY|O_CLOEXEC) = 3
fstat(3, {st_mode=S_IFREG|0644, st_size=1669168, ...}) = 0
mmap(NULL, 1669168, PROT_READ, MAP_PRIVATE, 3, 0) = 0x7f2baae0b000
close(3) = 0
ioctl(0, TCGETS, {B38400 opost isig icanon echo ...}) = 0
lstat("/", {st_mode=S_IFDIR|0755, st_size=21, ...}) = 0
newfstatat(AT_FDCWD, "/var/cache/pacman/.broken-pkg-dont-touch", {st_mode=S_IFDIR|0755, st_size=2461, ...}, AT_SYMLINK_NOFOLLOW) = 0
openat(AT_FDCWD, "/var/cache/pacman/.broken-pkg-dont-touch", O_RDONLY|O_NOCTTY|O_NONBLOCK|O_DIRECTORY|O_NOFOLLOW) = 3
fstat(3, {st_mode=S_IFDIR|0755, st_size=2461, ...}) = 0
fcntl(3, F_GETFL) = 0x38800 (flags O_RDONLY|O_NONBLOCK|O_LARGEFILE|O_DIRECTORY|O_NOFOLLOW)
fcntl(3, F_SETFD, FD_CLOEXEC) = 0
getdents(3, /* 553 entries */, 32768) = 32712
close(3) = 0
openat(AT_FDCWD, "/var/cache/pacman/.broken-pkg-dont-touch", O_RDONLY|O_NOCTTY|O_NONBLOCK|O_DIRECTORY|O_NOFOLLOW) = 3
fcntl(3, F_GETFD) = 0
fcntl(3, F_SETFD, FD_CLOEXEC) = 0
fstat(3, {st_mode=S_IFDIR|0755, st_size=2461, ...}) = 0
fcntl(3, F_GETFL) = 0x38800 (flags O_RDONLY|O_NONBLOCK|O_LARGEFILE|O_DIRECTORY|O_NOFOLLOW)
fcntl(3, F_SETFD, FD_CLOEXEC) = 0
fcntl(3, F_DUPFD, 3) = 4
fcntl(4, F_GETFD) = 0
fcntl(4, F_SETFD, FD_CLOEXEC) = 0
getdents(3, /* 553 entries */, 32768) = 32712
brk(0x25b8000) = 0x25b8000
getdents(3, /* 555 entries */, 32768) = 32760
brk(0x25d9000) = 0x25d9000
getdents(3, /* 551 entries */, 32768) = 32760
brk(0x25fa000) = 0x25fa000
brk(0x261b000) = 0x261b000
getdents(3,
I ran perf top and it shows zfs_leaf_lookup_closest at the top with 90%+ overhead.
Some things to answer, to increase the chances of a reply to your issue:
o What distro are you running
o What kernel version
o What ZFS code version - ' dpkg -l|grep zfs ' if debian/ubuntu
o zpool configuration (raidzX, striped mirrors, etc) - results of ' zpool status ' at least
o What kind of disks make up the zpool (manufacturer, model) and how are they connected (motherboard SATA, SAS card?)
o What zfs dataset options do you have enabled (atime, etc) - ' zfs get all '
Just my $2.02...
Running Arch Linux, zfs-linux 0.6.5.8_4.8.4_1-1 (archzfs-linux).
Mirrored SanDisk Extreme Pro SSDs:
  pool: vault
 state: ONLINE
  scan: scrub repaired 0 in 0h8m with 0 errors on Mon Oct 23 00:09:45 2016
config:

        NAME                                       STATE     READ WRITE CKSUM
        vault                                      ONLINE       0     0     0
          mirror-0                                 ONLINE       0     0     0
            ata-SanDisk_SDSSDXPS480G_152271401093  ONLINE       0     0     0
            ata-SanDisk_SDSSDXPS480G_154501401266  ONLINE       0     0     0

errors: No known data errors
Created with:
zpool create -f -o ashift=12 vault mirror \
ata-SanDisk_SDSSDXPS480G_152271401093 \
ata-SanDisk_SDSSDXPS480G_154501401266
zfs set compression=on vault
zfs set atime=on vault
zfs set relatime=on vault
zfs create -o setuid=off \
-o devices=off \
-o sync=disabled \
-o mountpoint=/tmp vault/tmp
zfs create -o xattr=sa -o mountpoint=legacy vault/var
zfs create -o mountpoint=legacy vault/usr
zfs_leaf_lookup_closest at the top with 90%+ overhead.
@johnramsden
I can't find this function anywhere. Do you run with any patches?
Oh, I think it should be zap_leaf_lookup_closest.
Yea, mistyped. Any clue what may have caused this?
@johnramsden
Please try cat /proc/pid/stack or sysrq-l to get the stack trace of the command.
Also, do zdb -dddddd pool/fs inode_num with the inode number of the directory.
As in zdb -dddddd <poolname>/<dataset> <inode num>? I must be doing something wrong:
I got the inode with
ls -id /mnt/tank/replication/Archon/var/cache/pacman/.broken-pkg-dont-touch
9 /mnt/tank/replication/Archon/var/cache/pacman/.broken-pkg-dont-touch
# zdb -dddddd tank/replication/Archon/var 9
zdb: can't open 'tank/replication/Archon/var': No such file or directory
By the way, at this point I had to destroy the dataset; however, in order to figure out what was wrong I first sent the dataset to my FreeNAS box. So I'm not doing the debugging on Linux anymore, and I'm not sure if the commands are completely identical between FreeBSD and Linux.
@johnramsden
Then please try zdb -edddddd tank/replication/Archon/var 9
It got stuck.
~# zdb -edddddd tank/replication/Archon/var 9
Dataset tank/replication/Archon/var [ZPL], ID 13915, cr_txg 3635580, 9.41G, 19890 objects, rootbp DVA[0]=<0:1628ddd9b000:3000> DVA[1]=<0:54519669000:3000> [L0 DMU objset] fletcher4 uncompressed LE contiguous unique double size=800L/800P birth=6479680L/6479680P fill=19890 cksum=a3347a264:bbfc9206d78:8da3798cf8b5b:504b54971abd720
Object lvl iblk dblk dsize lsize %full type
9 2 16K 16K 736K 368K 100.00 ZFS directory (K=inherit) (Z=inherit)
168 bonus System attributes
dnode flags: USED_BYTES USERUSED_ACCOUNTED
dnode maxblkid: 22
path /cache/pacman/.broken-pkg-dont-touch
uid 0
gid 0
atime Thu Jun 9 16:54:02 2016
mtime Thu Oct 27 16:39:12 2016
ctime Fri Oct 28 13:33:03 2016
crtime Sun May 15 16:22:04 2016
gen 528
mode 40755
size 2461
parent 8
links 2
pflags 40800000144
@johnramsden how did you resolve this in the end?
I think I'm having the same issue.
BTW, I'm on FreeBSD, so if this is the same thing, the issue should probably be punted upstream.
@panta82 no resolution, I remade my pool.
I tried looking for an openzfs bug tracker but was unable to find one. All I could find were links to the illumos bug tracker. Do you happen to know where the best place to open a new bug report would be?
@johnramsden
Nope. I get the feeling the ZFS project is one of those mailing-list + IRC operations rather than GitHub + Slack, if you know what I mean.
So I guess I am rebuilding my pool. Or dropping ZFS altogether; I don't know yet.
@johnramsden Have a look at the OpenZFS participate wiki page. I think you can send the bug/issue you discovered to the OpenZFS devs via email to the developer mailing list.
The author of this issue, @johnramsden, clearly had a corrupted fat zap. The underlying issue is how it happened and the secondary question is how to work around the corruption.
As to the workaround, it _should_ be possible to either roll back and/or destroy the dataset containing the corrupted directory, after, of course, preserving the data as necessary.
As to the cause, AFAIK, there aren't any known issues that will cause this type of corruption in a fat zap so long as zfs_nocacheflush is set to zero (the default) and the underlying disk vdev honors WRITE_FLUSH_FUA and/or WRITE_BARRIER flags correctly.
It does seem we could add some defensive code in zap_leaf_lookup_closest() to prevent the infinite loop.
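Something along these lines might do it (a rough, untested sketch only, to illustrate the idea; the zap_leaf_phys_sane() helper and the choice of EIO are mine, not existing code):

/*
 * Untested sketch: reject a leaf whose header reads back as zeros and bound
 * the chain walk, so a corrupted leaf fails the lookup with EIO instead of
 * spinning forever under readdir().
 */
static boolean_t
zap_leaf_phys_sane(zap_leaf_t *l)
{
        /* A valid leaf block always carries ZBT_LEAF and ZAP_LEAF_MAGIC. */
        return (zap_leaf_phys(l)->l_hdr.lh_block_type == ZBT_LEAF &&
            zap_leaf_phys(l)->l_hdr.lh_magic == ZAP_LEAF_MAGIC);
}

/* ...and in zap_leaf_lookup_closest(), before/inside the chain walk: */
        uint16_t nwalked = 0;

        if (!zap_leaf_phys_sane(l))
                return (SET_ERROR(EIO));

        for (chunk = zap_leaf_phys(l)->l_hash[lh];
            chunk != CHAIN_END; chunk = le->le_next) {
                /* No chain can have more links than there are chunks in the leaf. */
                if (nwalked++ > ZAP_LEAF_NUMCHUNKS(l))
                        return (SET_ERROR(EIO));
                le = ZAP_LEAF_ENTRY(l, chunk);
                ...
        }

That wouldn't repair the corruption, of course, but ls/rm would get an I/O error instead of hanging.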
I'd also suggest to anyone encountering this problem to build their ZFS module with debugging enabled. There are a bunch of extra ASSERTS which will be enabled and might help narrow down the corruption.
@dweeezil If you are referring to a property being set, I'm just running defaults.
# zfs get zfs:zfs_nocacheflush vault
NAME PROPERTY VALUE SOURCE
vault zfs:zfs_nocacheflush
I haven't seen the problem since, but I'm on the lookout. In the meantime I have built my current setup to be very modular, so it's possible to destroy a single dataset if this reoccurs.
I have the exact same issue on FreeBSD ZFS.
The hung thread is spinning in zap_leaf_lookup_closest().
As @dweeezil stated in https://github.com/zfsonlinux/zfs/issues/4583 (which I
believe is the same issue), this is a case of a corrupted fat ZAP.
The zap corruption is a zeroed zap_leaf_phys_t{}.
If ZFS is compiled with debug then the zeroed zap_leaf_phys_t{} will hit the
following assert() in zap_get_leaf_byblk():
ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_block_type, ==, ZBT_LEAF)
Without debug, readdir() will loop forever in zap_leaf_lookup_closest(),
because with the zeroed zap_leaf_phys_t{} le->le_next is always 0 and the loop never reaches CHAIN_END (0xffff):
for (chunk = zap_leaf_phys(l)->l_hash[lh];
    chunk != CHAIN_END; chunk = le->le_next) {
        le = ZAP_LEAF_ENTRY(l, chunk);
        ...
}
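To make that concrete, here is a tiny stand-alone user-space model of the walk (purely illustrative; the struct below is a simplification, not the real zap_leaf layout): the head index read from the zeroed l_hash is 0, chunk 0's le_next is also 0, and the 0xffff CHAIN_END sentinel is never reached.

/*
 * Simplified stand-alone model of the chain walk above (illustration only,
 * not the real zap_leaf layout).  A hole in the ZAP object reads back as
 * all zeros, so the head pointer and every le_next are 0 and the loop can
 * never reach the 0xffff end-of-chain marker.
 */
#include <stdio.h>
#include <stdint.h>
#include <string.h>

#define CHAIN_END 0xffff        /* same sentinel value zap_leaf.c uses */
#define NCHUNKS   512           /* arbitrary size for the model */

struct fake_entry {
        uint16_t le_next;
};

int
main(void)
{
        uint16_t l_hash[NCHUNKS];
        struct fake_entry entries[NCHUNKS];
        uint16_t chunk;
        unsigned long iters = 0;

        memset(l_hash, 0, sizeof (l_hash));     /* zeroed leaf */
        memset(entries, 0, sizeof (entries));

        for (chunk = l_hash[0]; chunk != CHAIN_END;
            chunk = entries[chunk].le_next) {
                if (++iters > 10) {
                        printf("still spinning after %lu iterations, "
                            "chunk = %u\n", iters, chunk);
                        break;  /* cap it so the demo terminates */
                }
        }
        return (0);
}

Running it shows chunk is still 0 when the capped walk gives up.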
More details below from both kernels.
It's easy enough to avoid the infinite loop in zap_leaf_lookup_closest(),
but does anyone have any ideas on the root cause?
Debug kernel details (hits assert() in zap_get_leaf_byblk()):
#12 assfail3 (a=, ...
#13 zap_get_leaf_byblk (zap=, blkid=, tx=, lt=, lp=0xfffffe08b3f1f7d8)
    at zfs/../../cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c:553
#14 zap_deref_leaf (zap=0xfffff8004b2b3800, h=0, tx=0x0, lt=RW_READER, lp=0xfffffe08b3f1f7d8)
    at zfs/../../cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c:605
#15 fzap_cursor_retrieve (zap=0xfffff8004b2b3800, zc=0xfffffe08b3f1f7c8, za=0xfffffe08b3f1f6b0)
    at zfs/../../cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c:1202
#16 zap_cursor_retrieve (zc=0xfffffe08b3f1f7c8, za=0xfffffe08b3f1f6b0)
    at zfs/../../cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c:1299
#17 zfs_freebsd_readdir (ap=)
    at zfs/../../cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c:2684
#18 VOP_READDIR_APV (vop=, a=) at vnode_if.c:1821
#19 kern_getdirentries (td=0xfffff801274b7000, ... at vnode_if.h:758
#20 sys_getdirentries (td=0xffffffff80b4e100, uap=0xfffffe08b3f1fa40) at sys/kern/vfs_syscalls.c:4031
from frame 13:
(kgdb) set $l = (zap_leaf_t *)0xfffff8014cda100
(kgdb) p *$l
$2 = {
l_dbu = {
dbu_tqent = {
tqent_task = {
ta_link = {
stqe_next = 0x0
},
ta_pending = 0,
ta_priority = 0,
ta_func = 0,
ta_context = 0x0
},
tqent_func = 0,
tqent_arg = 0x0
},
dbu_evict_func = 0xffffffff811540b0
dbu_clear_on_evict_dbufp = 0xfffff8014cda1070
},
l_rwlock = {
lock_object = {
lo_name = 0xffffffff8121f01b "l->l_rwlock",
lo_flags = 40960000,
lo_data = 0,
lo_witness = 0x0
},
sx_lock = 17
},
l_blkid = 1,
l_bs = 14,
l_dbuf = 0xfffff8014c1d8000
}
(kgdb) p *$l->l_dbuf
$3 = {
db_object = 217487,
db_offset = 16384,
db_size = 16384,
db_data = 0xfffffe0015319000
}
(kgdb) set $zlp = (zap_leaf_phys_t *)0xfffffe0015319000
(kgdb) p *$zlp
$4 = {
l_hdr = {
lh_block_type = 0,
lh_pad1 = 0,
lh_prefix = 0,
lh_magic = 0,
lh_nfree = 0,
lh_nentries = 0,
lh_prefix_len = 0,
lh_freelist = 0,
lh_flags = 0 '0',
lh_pad2 = "000000000000000000000000000000"
},
l_hash = {0}
}
(kgdb) x/4096xw 0xfffffe0015319000
0xfffffe0015319000: 0x00000000 0x00000000 0x00000000 0x00000000
0xfffffe0015319010: 0x00000000 0x00000000 0x00000000 0x00000000
...
...
...
0xfffffe001531cff0: 0x00000000 0x00000000 0x00000000 0x00000000
From frame 14:
(kgdb) p *zap
$5 = {
zap_dbu = {
dbu_tqent = {
tqent_task = {
ta_link = {
stqe_next = 0x0
},
ta_pending = 0,
ta_priority = 0,
ta_func = 0,
ta_context = 0x0
},
tqent_func = 0,
tqent_arg = 0x0
},
dbu_evict_func = 0xffffffff811583d0
dbu_clear_on_evict_dbufp = 0xfffff8004b2b3850
},
zap_objset = 0xfffff80020841000,
zap_object = 217487,
zap_dbuf = 0xfffff8014cd52310,
zap_rwlock = {
lock_object = {
lo_name = 0xffffffff81220390 "zap->zap_rwlock",
lo_flags = 40960000,
lo_data = 0,
lo_witness = 0x0
},
sx_lock = 17
},
zap_ismicro = 0,
zap_normflags = 0,
zap_salt = 26086651423,
zap_u = {
zap_fat = {
zap_num_entries_mtx = {
lock_object = {
lo_name = 0xffffffff812203a1 "zap->zap_u.zap_fat.zap_num_entries_mtx",
lo_flags = 40960000,
lo_data = 0,
lo_witness = 0x0
},
sx_lock = 1
},
zap_block_shift = 14
},
zap_micro = {
zap_num_entries = 929,
zap_num_chunks = -32478,
zap_alloc_next = -1,
zap_avl = {
avl_root = 0x2710000,
avl_compar = 0,
avl_offset = 1,
avl_numnodes = 14,
avl_size = 0
}
}
}
}
NON-Debug kernel details (spins in zap_leaf_lookup_closest()):
#13 zap_leaf_lookup_closest (l=0xfffff801a1aa0500, h=0, cd=0, zeh=0xfffffe08b3a8f508)
    at zfs/../../cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c:492
#14 0xffffffff80e2cf72 in fzap_cursor_retrieve (zap=0xfffff801a1a9a500, zc=0xfffffe08b3a8f7c8, za=0xfffffe08b3a8f6b0)
    at sys/modules/zfs/../../cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c:1211
#15 zap_cursor_retrieve (zc=0xfffffe08b3a8f7c8, za=0xfffffe08b3a8f6b0)
    at sys/modules/zfs/../../cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c:1299
#16 zfs_freebsd_readdir (ap=)
    at sys/modules/zfs/../../cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c:2684
#17 VOP_READDIR_APV (vop=, a=) at vnode_if.c:1821
#18 kern_getdirentries (td=0xfffff801a16a7490, ... at vnode_if.h:758
#19 sys_getdirentries (td=0xffffffff80af1038, uap=0xfffffe08b3a8fa40) at sys/kern/vfs_syscalls.c:4031
Variable values from zap_leaf_lookup_closest():
Initial variable values:
leaf 0xfffff801a1aa0500, leaf phys 0xfffffe0019615000, h 0x0, cd 0x0, lh 0x0, bestlh 0x1ff
Inside the for (chunk = zap_leaf_phys(l)->l_hash[lh]; .... ) loop:
zap_leaf_lookup_closest/494: chunk 0x0, le 0xfffffe0019615430, le_next 0x0, le_hash 0x0, le_cd 0x0
zap_leaf_lookup_closest/499: h 0x0, cd 0x0, besth 0xffffffffffffffff, bestcd 0xffffffff
zap_leaf_lookup_closest/516: update bestlh 0x0, besth 0x0, bestcd, 0x0
zap_leaf_lookup_closest/494: chunk 0x0, le 0xfffffe0019615430, le_next 0x0, le_hash 0x0, le_cd 0x0
zap_leaf_lookup_closest/499: h 0x0, cd 0x0, besth 0x0, bestcd 0x0
zap_leaf_lookup_closest/516: update bestlh 0x0, besth 0x0, bestcd, 0x0
...
...
repeat the 494, 499, 516 (line numbers) .....
Nice find @baukus.
Are you sure your hardware is sound? Were there any power-loss issues? I mean, you use SSDs in your pool, and today's SSDs are not all trustworthy. Please check your devices against https://github.com/rkojedzinszky/zfsziltest
If that passes multiple times, your hardware can be trusted. If it fails with even one error, you should not use those devices any longer.
@rkojedzinszky : SSDs are not involved.
I'm not any closer to finding the root cause.
An observation while I wait for an epiphany:
The zeroed zap leaf is NOT a zeroed block on disk but rather a hole in the ZAP directory "file".
After hacking zdb to not assert() nor spin on the invalid directory, I get the following (which shows that there are not any DVAs allocated for offsets 0x4000 to 0xc000; the first ZAP leaf block is at offset 0x4000).
./zdb -ddddddddd poolXramerXXX/xxxxxxSaveSnap 217487
Dataset poolXramerXXX/xxxxxxSaveSnap [ZPL], ID 45, cr_txg 68, 1.55T, 683313 objects, rootbp DVA[0]=<2:50001d6200:200> DVA[1]=<3:5000216c00:200> [L0 DMU objset] fletcher4 lz4 LE contiguous unique double size=800L/200P birth=28262L/28262P fill=683313 cksum=11aad35e1b:5e8c8404994:10f286d875402:2265c43507a460
Object lvl iblk dblk dsize lsize %full type
217487 2 16K 16K 96.0K 176K 81.82 ZFS directory (K=inherit) (Z=inherit)
256 bonus System attributes
dnode flags: USED_BYTES USERUSED_ACCOUNTED
dnode maxblkid: 10
SA hdrsize 8
SA layout 2
path /xxxxxxxxx/path/name/removed/xxxxxxxx
uid 30000043e
gid 300000201
atime Tue Feb 7 21:46:05 2017
mtime Tue Feb 7 21:46:04 2017
ctime Tue Feb 7 21:46:04 2017
crtime Thu Jan 12 07:27:21 2017
gen 2208901
mode 40777
size 1106
parent 218703
links 2
pflags 40800000102
ndacl 8
skipping last half of zap_leaf_stats()
Fat ZAP stats:
Pointer table:
1024 elements
zt_blk: 0
zt_numblks: 0
zt_shift: 10
zt_blks_copied: 0
zt_nextblk: 0
ZAP entries: 1104
Leaf blocks: 10
Total blocks: 11
zap_block_type: 0x8000000000000001
zap_magic: 0x2f52ab2ab
zap_salt: 0x612e2b61f
Leafs with 2^n pointers:
6: 3 ***
7: 5 *****
8: 0
9: 2 **
Blocks with n*5 entries:
0: 2 **
1: 0
2: 0
3: 0
4: 0
5: 0
6: 0
7: 0
8: 0
9: 8 ********
Blocks n/10 full:
5: 3 ***
6: 0
7: 0
8: 0
9: 7 *******
Entries with n chunks:
Buckets with n entries:
Indirect blocks:
0 L1 4:5000031600:400 0:500001f600:400 4000L/400P F=9 B=26340/26340
0 L0 4:5000076e00:200 0:5000036000:200 4000L/200P F=1 B=26340/26340
c000 L0 3:1286cc6c00:1400 4:1401e2c800:1400 4000L/1400P F=1 B=6771/6771
10000 L0 3:1204d66000:1800 4:1401e2f400:1800 4000L/1800P F=1 B=6771/6771
14000 L0 3:12caebb800:1800 4:1401e2dc00:1800 4000L/1800P F=1 B=6771/6771
18000 L0 4:500006e000:1a00 0:500002c200:1a00 4000L/1a00P F=1 B=26336/26336
1c000 L0 3:12508ab400:1a00 4:1401e37600:1a00 4000L/1a00P F=1 B=6771/6771
20000 L0 4:5000077400:1a00 0:5000036600:1a00 4000L/1a00P F=1 B=26340/26340
24000 L0 3:1206777400:1400 4:1401e32600:1400 4000L/1400P F=1 B=6771/6771
28000 L0 3:1286cc8000:1400 4:1401e34800:1400 4000L/1400P F=1 B=6771/6771
segment [0000000000000000, 0000000000004000) size 16K
segment [000000000000c000, 000000000002c000) size 128K
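If anyone wants to poke at this programmatically, the hole can also be located with dmu_offset_next(); a minimal sketch (my own illustration, not something zdb does, assuming a libzpool/userland context where the dataset's objset_t "os" is already open and "obj" is the directory's object number, e.g. 217487 above):

/*
 * Illustration only: walk the holes in a directory's ZAP object using
 * dmu_offset_next().  Assumes a libzpool/userland context where "os" and
 * "obj" are placeholders supplied by the caller.
 */
#include <sys/zfs_context.h>
#include <sys/dmu.h>
#include <stdio.h>

static void
report_zap_holes(objset_t *os, uint64_t obj)
{
        uint64_t off = 0;

        /* B_TRUE asks for the next hole at or after "off". */
        while (dmu_offset_next(os, obj, B_TRUE, &off) == 0) {
                (void) printf("hole at object offset 0x%llx\n",
                    (u_longlong_t)off);

                /* B_FALSE skips forward to the next data block past the hole. */
                if (dmu_offset_next(os, obj, B_FALSE, &off) != 0)
                        break;
        }
}

On this dataset that should report a hole starting at 0x4000, matching the missing L0 blocks in the indirect block dump above.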
Hmm, what exactly does a hole in the ZAP directory "file" do?
This issue has been automatically marked as "stale" because it has not had any activity for a while. It will be closed in 90 days if no further activity occurs. Thank you for your contributions.
I believe the issue was actually due to corrupted memory; I ended up finding two faulty DIMMs (on a non-ECC desktop system) after running memtest. After replacing the bad memory I haven't had any corruption issues.
@johnramsden Awesome to hear you managed to fix your issues and thanks for checking out the issue so long after it was created :)