config:
core.https_address: '[::]:8443'
core.trust_password: true
api_extensions:
- storage_zfs_remove_snapshots
- container_host_shutdown_timeout
- container_stop_priority
- container_syscall_filtering
- auth_pki
- container_last_used_at
- etag
- patch
- usb_devices
- https_allowed_credentials
- image_compression_algorithm
- directory_manipulation
- container_cpu_time
- storage_zfs_use_refquota
- storage_lvm_mount_options
- network
- profile_usedby
- container_push
- container_exec_recording
- certificate_update
- container_exec_signal_handling
- gpu_devices
- container_image_properties
- migration_progress
- id_map
- network_firewall_filtering
- network_routes
- storage
- file_delete
- file_append
- network_dhcp_expiry
- storage_lvm_vg_rename
- storage_lvm_thinpool_rename
- network_vlan
- image_create_aliases
- container_stateless_copy
- container_only_migration
- storage_zfs_clone_copy
- unix_device_rename
- storage_lvm_use_thinpool
- storage_rsync_bwlimit
- network_vxlan_interface
- storage_btrfs_mount_options
- entity_description
- image_force_refresh
- storage_lvm_lv_resizing
- id_map_base
- file_symlinks
- container_push_target
- network_vlan_physical
- storage_images_delete
- container_edit_metadata
- container_snapshot_stateful_migration
- storage_driver_ceph
- storage_ceph_user_name
- resource_limits
- storage_volatile_initial_source
- storage_ceph_force_osd_reuse
- storage_block_filesystem_btrfs
- resources
- kernel_limits
- storage_api_volume_rename
- macaroon_authentication
- network_sriov
- console
- restrict_devlxd
- migration_pre_copy
- infiniband
- maas_network
- devlxd_events
- proxy
- network_dhcp_gateway
- file_get_symlink
- network_leases
- unix_device_hotplug
- storage_api_local_volume_handling
- operation_description
- clustering
- event_lifecycle
- storage_api_remote_volume_handling
- nvidia_runtime
- container_mount_propagation
- container_backup
- devlxd_images
- container_local_cross_pool_handling
- proxy_unix
- proxy_udp
- clustering_join
- proxy_tcp_udp_multi_port_handling
- network_state
- proxy_unix_dac_properties
- container_protection_delete
- unix_priv_drop
- pprof_http
- proxy_haproxy_protocol
- network_hwaddr
- proxy_nat
- network_nat_order
- container_full
- candid_authentication
- backup_compression
- candid_config
api_status: stable
api_version: "1.0"
auth: trusted
public: false
auth_methods:
- tls
environment:
addresses:
- 10.128.0.19:8443
- 10.0.0.1:8443
architectures:
- x86_64
- i686
certificate: |
-----BEGIN CERTIFICATE-----
MIIFVjCCAz6gAwIBAgIRALjTNSGkC8LBVbjd44xV3kAwDQYJKoZIhvcNAQELBQAw
OzEcMBoGA1UEChMTbGludXhjb250YWluZXJzLm9yZzEbMBkGA1UEAwwScm9vdEBs
eGQtMy1pbnN0YWxsMB4XDTE4MDkyNDIxMzIwN1oXDTI4MDkyMTIxMzIwN1owOzEc
MBoGA1UEChMTbGludXhjb250YWluZXJzLm9yZzEbMBkGA1UEAwwScm9vdEBseGQt
My1pbnN0YWxsMIICIjANBgkqhkiG9w0BAQEFAAOCAg8AMIICCgKCAgEA0LmOFzZY
nazSt2I+YvlOIYkNoWX+Ajq5g0SZslmOIZA07bVyQwrGY1Q3esHxN3ZzQGyr2UHb
LgjDJWlgXSyEkbYVWtN+xuNKKRilBnQFHrs350OcC7JaYvrEJVJ7ByPR8IZogofH
DLwIDkaRML01z2zAZd0rmnwW92lUuLhlsjrxEiJ4c6+pXENL++eqXjOXVIZSYp1X
TZi/lR2Ghk9SYF0pUGCSH3a6OSGB1PLCmfHkNVv5mZvbjQDGirD7yt3I1tzdVOCa
bquHJGYuPfxBqGsuA7RnER64jgVT2jw6zxd/jusabFbo7xi2X3Kc2+1FNB8kQK7e
96AThM1u7Ptu0gvAlcGTER5oMcAl72X5ySn3lSCG6qdtBtzicBNp7G0YCbTGcCGR
tOOj0gdP4sxE5PJ5xSOq6EL9bEZIRwkJpQ7xlscTyKBBjPBdPxwLc4rXwPyuGM6E
P0jcM/oxU5mQsBrazjDYq8RQmH0SGVraLiaB9BTL/W6w3bCXWlJdMUPhpjmCTOgN
L+Z+pACHIO+sriQB74mKGtBmsuQumOpS67C3jDEKYON7JUh6Y/Ap4cJDMqcUtdBO
Q6DpV3vfteoy3Ts6q47er2YUDrjgdUctu8Wiu29HWRuvyOg+UGtPKw1IIo5b2DgD
FKM/+XTpC/phfFMTvkymEos2B5UuW/vWLs0CAwEAAaNVMFMwDgYDVR0PAQH/BAQD
AgWgMBMGA1UdJQQMMAoGCCsGAQUFBwMBMAwGA1UdEwEB/wQCMAAwHgYDVR0RBBcw
FYINbHhkLTMtaW5zdGFsbIcECoAAFDANBgkqhkiG9w0BAQsFAAOCAgEAm8X8o28Y
7MTePDw5LLQC9rcHB+4cn+LK02FzLXpKCVRpdidphRBBHuLDUJ5lMfGvDmg0I+MY
XvXYFF1SK/HpEALTXkikxm4GOu9KCNm8h6k6QRPgX3vUqFVb6IkjYfOzEWWxz1l4
RmWAspynPQSU8nlI2dYL52J8OHckZWOqOG7LKNkeApQWn/VQCgbyVHD8pZR6l79H
He6tJlzaAtNFBRTlit0Xz83P/9P+q1BFEJwSBMqnvYKNZEpelnCMl+EUekg88OH+
Xz9qyGAlURrRM5FRdeuMADOViQYv8Ky6fgO3zngTuMMUsi9RrnIXtBA2b+O9rF15
ckShleHkdCg+Gn3f8iG7ruUAyaCkKFEm6PdI6Z8lgyG1N1gIhciX8eu32Mb8+Cl5
rkqhfPSsY3oMjyGufABVueCjYpsdQYyaG05AfU5kmiPgyv4tQpXZuicbfwvN5Pkm
E1Ar2eD0AIIiEv1RN/OK46jNbePkvvOyzN8wkt4nBJMZrJQDQ/9s8/216lINKv9C
WPpHpIkibEGzV30Z362pHYZhsinrLjQsDjE45SF18aJtSlP95AlM6x141B5sttjN
gVKhWewl2uHmJg/Eto8PKjPBwmpNWJbj9AWRkj0H8k5cUW4iS94UPW1+mEEu2TKv
88TkLly3tpZcm3qPOM+y+8ZRcV8fX7ngx3w=
-----END CERTIFICATE-----
certificate_fingerprint: <redacted>
driver: lxc
driver_version: 3.0.2
kernel: Linux
kernel_architecture: x86_64
kernel_version: 4.15.0-1019-gcp
server: lxd
server_pid: 1120
server_version: "3.5"
storage: zfs
storage_version: 0.7.5-1ubuntu16.3
server_clustered: false
server_name: container-images-1
I have a sequence of basic commands I am trying to execute against the container using the Hyperkit Ruby library (until we upgrade to pylxd...). These commands are very simple (creating a directory, unzipping a file, etc) and work in our current cluster running LXD 2.16, however, on LXD 3.5 I am seeing:
metadata:
context: {}
level: eror
message: 'Failed to retrieve PID of executing child process: EOF'
timestamp: "2018-09-24T21:56:19.394912046Z"
type: logging
in the monitor logs (note that this example's timestamp is off from the below examples as it was taken from a different run, but the error message is the same every time).
This seems to happen reliably but unpredictably, about 20% of the time.
This would be a pretty complex reproduction case. I could attempt to do with curl if necessary. The steps that are being followed though are:
exec endpointoperations endpoint to check if the operation is completeddmesg)This is all that's generated during the interations with the container:
[ 1459.708758] audit: type=1400 audit(1537827138.902:76): apparmor="STATUS" operation="profile_load" profile="unconfined" name="lxd-development-52bc90662885bbee-1537827135520_</var/snap/lxd/common/lxd>" pid=23075 comm="apparmor_parser"
[ 1459.721485] lxdbr0: port 12(veth741ACH) entered blocking state
[ 1459.721487] lxdbr0: port 12(veth741ACH) entered disabled state
[ 1459.721600] device veth741ACH entered promiscuous mode
[ 1459.721702] IPv6: ADDRCONF(NETDEV_UP): veth741ACH: link is not ready
[ 1459.721716] lxdbr0: port 12(veth741ACH) entered blocking state
[ 1459.721718] lxdbr0: port 12(veth741ACH) entered forwarding state
[ 1459.723542] lxdbr0: port 12(veth741ACH) entered disabled state
[ 1459.804872] eth0: renamed from vethSNSK6O
[ 1459.827466] IPv6: ADDRCONF(NETDEV_UP): eth0: link is not ready
[ 1459.828503] IPv6: ADDRCONF(NETDEV_CHANGE): eth0: link becomes ready
[ 1459.828556] IPv6: ADDRCONF(NETDEV_CHANGE): veth741ACH: link becomes ready
[ 1459.828621] lxdbr0: port 12(veth741ACH) entered blocking state
[ 1459.828623] lxdbr0: port 12(veth741ACH) entered forwarding state
[ 1459.890658] audit: type=1400 audit(1537827139.082:77): apparmor="DENIED" operation="mount" info="failed flags match" error=-13 profile="lxd-development-52bc90662885bbee-1537827135520_</var/snap/lxd/common/lxd>" name="/sys/fs/cgroup/unified/" pid=23080 comm="systemd" fstype="cgroup2" srcname="cgroup" flags="rw, nosuid, nodev, noexec"
[ 1459.890664] audit: type=1400 audit(1537827139.082:78): apparmor="DENIED" operation="mount" info="failed flags match" error=-13 profile="lxd-development-52bc90662885bbee-1537827135520_</var/snap/lxd/common/lxd>" name="/sys/fs/cgroup/unified/" pid=23080 comm="systemd" fstype="cgroup2" srcname="cgroup" flags="rw, nosuid, nodev, noexec"
[ 1460.038252] audit: type=1400 audit(1537827139.230:79): apparmor="DENIED" operation="mount" info="failed flags match" error=-13 profile="lxd-development-52bc90662885bbee-1537827135520_</var/snap/lxd/common/lxd>" name="/run/systemd/unit-root/run/lock/" pid=23234 comm="(networkd)" flags="ro, nosuid, nodev, noexec, remount, bind"
lxc info NAME --show-log)Note the last line:
Name: development-52bc90662885bbee-1537827135520
Remote: unix://
Architecture: x86_64
Created: 2018/09/24 22:12 UTC
Status: Running
Type: persistent
Profiles: default
Pid: 23080
Ips:
eth0: inet 10.0.87.57 veth741ACH
eth0: inet6 fe80::216:3eff:fe84:9013 veth741ACH
lo: inet 127.0.0.1
lo: inet6 ::1
Resources:
Processes: 14
Disk usage:
root: 5.59MB
CPU usage:
CPU usage (in seconds): 0
Memory usage:
Memory (current): 45.51MB
Memory (peak): 47.81MB
Network usage:
eth0:
Bytes received: 2.60kB
Bytes sent: 1.95kB
Packets received: 24
Packets sent: 14
lo:
Bytes received: 0B
Bytes sent: 0B
Packets received: 0
Packets sent: 0
Log:
lxc development-52bc90662885bbee-1537827135520 20180924221218.921 WARN conf - conf.c:lxc_map_ids:2917 - newuidmap binary is missing
lxc development-52bc90662885bbee-1537827135520 20180924221218.921 WARN conf - conf.c:lxc_map_ids:2923 - newgidmap binary is missing
lxc development-52bc90662885bbee-1537827135520 20180924221218.982 WARN conf - conf.c:lxc_map_ids:2917 - newuidmap binary is missing
lxc development-52bc90662885bbee-1537827135520 20180924221218.982 WARN conf - conf.c:lxc_map_ids:2923 - newgidmap binary is missing
lxc development-52bc90662885bbee-1537827135520 20180924221219.729 WARN conf - conf.c:lxc_setup_devpts:1618 - Invalid argument - Failed to unmount old devpts instance
lxc config show NAME --expanded)architecture: x86_64
config:
image.architecture: amd64
image.description: Ubuntu bionic amd64 (20180924_07:42)
image.os: Ubuntu
image.release: bionic
image.serial: "20180924_07:42"
volatile.base_image: 76c593807c13d36048f15390a17eb4dff8f0fef7e4eef9bdff83670570ff4afd
volatile.eth0.hwaddr: 00:16:3e:84:90:13
volatile.eth0.name: eth0
volatile.idmap.base: "0"
volatile.idmap.next: '[{"Isuid":true,"Isgid":true,"Hostid":1000000,"Nsid":0,"Maprange":1000000000}]'
volatile.last_state.idmap: '[{"Isuid":true,"Isgid":true,"Hostid":1000000,"Nsid":0,"Maprange":1000000000}]'
volatile.last_state.power: RUNNING
devices:
eth0:
nictype: bridged
parent: lxdbr0
type: nic
root:
path: /
pool: default
type: disk
ephemeral: false
profiles:
- default
stateful: false
description: ""
action=start created=2018-09-24T22:12:15+0000 ephemeral=false lvl=info msg="Starting container" name=development-52bc90662885bbee-1537827135520 stateful=false t=2018-09-24T22:12:18+0000 used=1970-01-01T00:00:00+0000
action=start created=2018-09-24T22:12:15+0000 ephemeral=false lvl=info msg="Started container" name=development-52bc90662885bbee-1537827135520 stateful=false t=2018-09-24T22:12:19+0000 used=1970-01-01T00:00:00+0000
lvl=eror msg="Failed to retrieve PID of executing child process: EOF" t=2018-09-24T22:12:26+0000
Run from Hyperkit, so all I get is a 400 error.
lxc monitor while reproducing the issue)Same as above, here again for reference:
metadata:
context: {}
level: eror
message: 'Failed to retrieve PID of executing child process: EOF'
timestamp: "2018-09-24T21:56:19.394912046Z"
type: logging
Can you show the entire lxc monitor output from the moment you start the lxc exec to the point where it's done?
@stgraber thanks for the insanely fast response. Here you go:
metadata:
context:
k: "0"
level: dbug
message: Found cert
timestamp: "2018-09-24T22:40:26.418467735Z"
type: logging
metadata:
context:
ip: 136.25.159.92:50356
method: POST
url: /1.0/containers/development-67858a0ce947e6a1-1537828807730/exec
level: dbug
message: handling
timestamp: "2018-09-24T22:40:26.418530318Z"
type: logging
metadata:
context: {}
level: dbug
message: 'forkcheckfile: Path doesn''t exist: No such file or directory'
timestamp: "2018-09-24T22:40:26.42752355Z"
type: logging
metadata:
context: {}
level: dbug
message: 'New task operation: 57687c0f-8fed-4f56-84b9-c5ab2cd2aae3'
timestamp: "2018-09-24T22:40:26.430710675Z"
type: logging
metadata:
context: {}
level: dbug
message: 'Started task operation: 57687c0f-8fed-4f56-84b9-c5ab2cd2aae3'
timestamp: "2018-09-24T22:40:26.430789355Z"
type: logging
metadata:
class: task
created_at: "2018-09-24T22:40:26.42757222Z"
description: Executing command
err: ""
id: 57687c0f-8fed-4f56-84b9-c5ab2cd2aae3
may_cancel: false
metadata: null
resources:
containers:
- /1.0/containers/development-67858a0ce947e6a1-1537828807730
status: Pending
status_code: 105
updated_at: "2018-09-24T22:40:26.42757222Z"
timestamp: "2018-09-24T22:40:26.43075796Z"
type: operation
metadata:
class: task
created_at: "2018-09-24T22:40:26.42757222Z"
description: Executing command
err: ""
id: 57687c0f-8fed-4f56-84b9-c5ab2cd2aae3
may_cancel: false
metadata: null
resources:
containers:
- /1.0/containers/development-67858a0ce947e6a1-1537828807730
status: Running
status_code: 103
updated_at: "2018-09-24T22:40:26.42757222Z"
timestamp: "2018-09-24T22:40:26.430801519Z"
type: operation
metadata:
context:
k: "0"
level: dbug
message: Found cert
timestamp: "2018-09-24T22:40:26.616982575Z"
type: logging
metadata:
context:
ip: 136.25.159.92:50358
method: GET
url: /1.0/operations/57687c0f-8fed-4f56-84b9-c5ab2cd2aae3/wait
level: dbug
message: handling
timestamp: "2018-09-24T22:40:26.617044666Z"
type: logging
metadata:
context: {}
level: eror
message: 'Failed to retrieve PID of executing child process: EOF'
timestamp: "2018-09-24T22:40:26.968266228Z"
type: logging
metadata:
context: {}
level: dbug
message: 'Updated metadata for task operation: 57687c0f-8fed-4f56-84b9-c5ab2cd2aae3'
timestamp: "2018-09-24T22:40:26.968356788Z"
type: logging
metadata:
context: {}
level: dbug
message: 'Failure for task operation: 57687c0f-8fed-4f56-84b9-c5ab2cd2aae3: EOF'
timestamp: "2018-09-24T22:40:26.968416833Z"
type: logging
metadata:
class: task
created_at: "2018-09-24T22:40:26.42757222Z"
description: Executing command
err: EOF
id: 57687c0f-8fed-4f56-84b9-c5ab2cd2aae3
may_cancel: false
metadata:
return: -1
resources:
containers:
- /1.0/containers/development-67858a0ce947e6a1-1537828807730
status: Failure
status_code: 400
updated_at: "2018-09-24T22:40:26.96834959Z"
timestamp: "2018-09-24T22:40:26.968427238Z"
type: operation
metadata:
class: task
created_at: "2018-09-24T22:40:26.42757222Z"
description: Executing command
err: ""
id: 57687c0f-8fed-4f56-84b9-c5ab2cd2aae3
may_cancel: false
metadata:
return: -1
resources:
containers:
- /1.0/containers/development-67858a0ce947e6a1-1537828807730
status: Running
status_code: 103
updated_at: "2018-09-24T22:40:26.96834959Z"
timestamp: "2018-09-24T22:40:26.968376032Z"
type: operation
Can you look at the content of /var/snap/lxd/common/lxd/logs/development-67858a0ce947e6a1-1537828807730/forkexec.log?
Also it looks like for this one I am waiting for the command to finish (not polling the operations endpoint). I've seen the same behavior with both approaches.
root@container-images-1:~# cat /var/snap/lxd/common/lxd/logs/development-67858a0ce947e6a1-1537828807730/forkexec.log
runtime stack:
runtime.throw(0x11259a6, 0x17)
/build/lxd/parts/go/build/src/runtime/panic.go:608 +0x72
runtime.netpoll(0x32202a49300, 0xdf8475800)
/build/lxd/parts/go/build/src/runtime/netpoll_epoll.go:75 +0x215
runtime.sysmon()
/build/lxd/parts/go/build/src/runtime/proc.go:4390 +0x41b
runtime.mstart1()
/build/lxd/parts/go/build/src/runtime/proc.go:1282 +0xe6
runtime.mstart()
/build/lxd/parts/go/build/src/runtime/proc.go:1248 +0x6e
goroutine 1 [runnable]:
gopkg.in/lxc/go-lxc%2ev2._Cfunc_setArrayString(0x28cecb0, 0x28ae7d0, 0x0)
_cgo_gotypes.go:1228 +0x45
gopkg.in/lxc/go-lxc%2ev2.makeNullTerminatedArgs.func1(0x28cecb0, 0x28ae7d0, 0x0)
/build/lxd/parts/lxd/go/src/gopkg.in/lxc/go-lxc.v2/util.go:61 +0x6a
gopkg.in/lxc/go-lxc%2ev2.makeNullTerminatedArgs(0xc0002ca040, 0x3, 0x4, 0x0)
/build/lxd/parts/lxd/go/src/gopkg.in/lxc/go-lxc.v2/util.go:61 +0x91
gopkg.in/lxc/go-lxc%2ev2.(*Container).RunCommandNoWait(0xc00025fad0, 0xc0002ca040, 0x3, 0x4, 0xffffffffffffffff, 0xffffffffffffffff, 0x7ffc8c2498fb, 0x5, 0xffffffffffffffff, 0xffffffffffffffff, ...)
/build/lxd/parts/lxd/go/src/gopkg.in/lxc/go-lxc.v2/container.go:1322 +0xef
main.(*cmdForkexec).Run(0xc0000c26d0, 0xc0002bc000, 0xc0002c20e0, 0xd, 0xe, 0x0, 0x0)
/build/lxd/parts/lxd/go/src/github.com/lxc/lxd/lxd/main_forkexec.go:124 +0x558
main.(*cmdForkexec).Run-fm(0xc0002bc000, 0xc0002c20e0, 0xd, 0xe, 0x0, 0x0)
/build/lxd/parts/lxd/go/src/github.com/lxc/lxd/lxd/main_forkexec.go:31 +0x52
github.com/spf13/cobra.(*Command).execute(0xc0002bc000, 0xc0002c2000, 0xe, 0xe, 0xc0002bc000, 0xc0002c2000)
/build/lxd/parts/lxd/go/src/github.com/spf13/cobra/command.go:762 +0x473
github.com/spf13/cobra.(*Command).ExecuteC(0xc00029b180, 0x1145de7, 0x0, 0xc0002bf900)
/build/lxd/parts/lxd/go/src/github.com/spf13/cobra/command.go:852 +0x2fd
github.com/spf13/cobra.(*Command).Execute(0xc00029b180, 0xc0002abec8, 0x1)
/build/lxd/parts/lxd/go/src/github.com/spf13/cobra/command.go:800 +0x2b
main.main()
/build/lxd/parts/lxd/go/src/github.com/lxc/lxd/lxd/main.go:164 +0xe13
goroutine 19 [syscall]:
os/signal.signal_recv(0x0)
/build/lxd/parts/go/build/src/runtime/sigqueue.go:139 +0x9c
os/signal.loop()
/build/lxd/parts/go/build/src/os/signal/signal_unix.go:23 +0x22
created by os/signal.init.0
/build/lxd/parts/go/build/src/os/signal/signal_unix.go:29 +0x41
Can you show snap info lxd?
root@container-images-1:~# snap info lxd
name: lxd
summary: System container manager and API
publisher: Canonical✓
contact: https://github.com/lxc/lxd/issues
license: unset
description: |
LXD is a container manager for system containers.
It offers a REST API to remotely manage containers over the network, using an image based workflow
and with support for live migration.
Images are available for all Ubuntu releases and architectures as well as for a wide number of
other Linux distributions.
LXD containers are lightweight, secure by default and a great alternative to virtual machines.
commands:
- lxd.benchmark
- lxd.buginfo
- lxd.check-kernel
- lxd.lxc
- lxd
- lxd.migrate
services:
lxd.daemon: simple, enabled, active
snap-id: J60k4JY0HppjwOjW8dZdYc8obXKxujRu
tracking: stable
refresh-date: today at 21:32 UTC
channels:
stable: 3.5 (8774) 66MB -
candidate: 3.5 (8774) 66MB -
beta: ↑
edge: git-efc73da (8791) 66MB -
3.0/stable: 3.0.2 (8715) 65MB -
3.0/candidate: 3.0.2 (8715) 65MB -
3.0/beta: ↑
3.0/edge: git-1722ea8 (8801) 65MB -
2.0/stable: 2.0.11 (8023) 28MB -
2.0/candidate: 2.0.11 (8023) 28MB -
2.0/beta: ↑
2.0/edge: git-92a4fdc (8000) 26MB -
installed: 3.5 (8774) 66MB -
Ok, can you give me the exact query being sent to LXD?
Looks like a problem with the command or arguments being parsed inside go-lxc.
I know Hyperkit isn't officially supported, but a request to the LXD API is being made here:
https://github.com/jeffshantz/hyperkit/blob/master/lib/hyperkit/client/containers.rb#L315
We're not using any of the option params (except occasionally sync, as mentioned).
The Ruby object we're passing in is an array, a la ['mkdir', '-p', '/src/codevolve'].
Let me know if this isn't enough and I can try to extract it some other way.
@brauner if still lurking around, maybe you'll spot something obviously bad in the go-lxc code ^
@saulcosta I'll see if I can reproduce it on a system here
@saulcosta no luck so far, so if you can reproduce it with curl or lxc query, that'd be great
Alternatively I can give you a custom LXD binary you can sideload on your system and that'd hopefully record more stuff.
@stgraber I'm able to with:
root@container-images-1:~# lxc launch ubuntu:
Creating the container
Container name is: integral-arachnid
Starting integral-arachnid
root@container-images-1:~# lxc query -X POST /1.0/containers/integral-arachnid/exec -d '{ "command": ["echo", "foo"] }' --wait
{
"class": "task",
"created_at": "2018-09-24T23:17:47.223217865Z",
"description": "Executing command",
"err": "",
"id": "855112be-c233-405c-8868-f030ae397848",
"may_cancel": false,
"metadata": {
"return": 0
},
"resources": {
"containers": [
"/1.0/containers/integral-arachnid"
]
},
"status": "Success",
"status_code": 200,
"updated_at": "2018-09-24T23:17:47.28828152Z"
}
root@container-images-1:~# lxc query -X POST /1.0/containers/integral-arachnid/exec -d '{ "command": ["echo", "foo"] }' --wait
{
"class": "task",
"created_at": "2018-09-24T23:17:51.775155133Z",
"description": "Executing command",
"err": "",
"id": "90a1073d-fda5-42fa-9a92-63b22efb32f8",
"may_cancel": false,
"metadata": {
"return": 0
},
"resources": {
"containers": [
"/1.0/containers/integral-arachnid"
]
},
"status": "Success",
"status_code": 200,
"updated_at": "2018-09-24T23:17:51.831319275Z"
}
root@container-images-1:~# lxc query -X POST /1.0/containers/integral-arachnid/exec -d '{ "command": ["echo", "foo"] }' --wait
Error: EOF
Note that it took a few times, this is consistent with what I've seen thus far.
Yeah, I can't reproduce it all the time, but with your reproducer, I do get it some of the time at least.
That should be enough to figure out what's going on, thanks.
This looks like a 0 size parameter is passed to calloc in makeCharArray and then 0 is used as index into the unallocated array
The calloc fix from @brauner in go-lxc fixed one of the issues in this codepath, but I'm unfortunately still seeing very frequent crashes, they now look like:
root@lantea:~# lxc query -X POST /1.0/containers/c1/exec -d '{ "command": ["echo", "foo"]}' --wait
Error: EOF
root@lantea:~# cat /var/snap/lxd/common/lxd/logs/c1/forkexec.log
runtime stack:
runtime.throw(0x10ddfdf, 0x17)
/usr/lib/go-1.10/src/runtime/panic.go:616 +0x81
runtime.netpoll(0x6f7dbab3f00, 0xdf8475800)
/usr/lib/go-1.10/src/runtime/netpoll_epoll.go:75 +0x216
runtime.sysmon()
/usr/lib/go-1.10/src/runtime/proc.go:4265 +0x424
runtime.mstart1(0x0)
/usr/lib/go-1.10/src/runtime/proc.go:1227 +0xe7
runtime.mstart()
/usr/lib/go-1.10/src/runtime/proc.go:1193 +0x76
goroutine 1 [runnable]:
gopkg.in/lxc/go-lxc%2ev2._cgo_cmalloc(0x6, 0x33b6f20)
_cgo_gotypes.go:1246 +0x4a
gopkg.in/lxc/go-lxc%2ev2._Cfunc_CString(0x7fff6077f8ff, 0x5, 0x33b6e50)
_cgo_gotypes.go:223 +0x33
gopkg.in/lxc/go-lxc%2ev2.(*Container).RunCommandNoWait(0xc4202cf380, 0xc4203080a0, 0x2, 0x2, 0xffffffffffffffff, 0xffffffffffffffff, 0x7fff6077f8ff, 0x5, 0xffffffffffffffff, 0xffffffffffffffff, ...)
/home/stgraber/data/code/go/src/gopkg.in/lxc/go-lxc.v2/container.go:1340 +0x232
main.(*cmdForkexec).Run(0xc4201cc1d8, 0xc4202f7400, 0xc4202192b0, 0xc, 0xd, 0x0, 0x0)
/home/stgraber/data/code/go/src/github.com/lxc/lxd/lxd/main_forkexec.go:124 +0x963
main.(*cmdForkexec).Run-fm(0xc4202f7400, 0xc4202192b0, 0xc, 0xd, 0x0, 0x0)
/home/stgraber/data/code/go/src/github.com/lxc/lxd/lxd/main_forkexec.go:31 +0x52
github.com/spf13/cobra.(*Command).execute(0xc4202f7400, 0xc420219040, 0xd, 0xd, 0xc4202f7400, 0xc420219040)
/home/stgraber/data/code/go/src/github.com/spf13/cobra/command.go:762 +0x468
github.com/spf13/cobra.(*Command).ExecuteC(0xc4202f6780, 0x0, 0xc420304f00, 0xc420304f00)
/home/stgraber/data/code/go/src/github.com/spf13/cobra/command.go:852 +0x30a
github.com/spf13/cobra.(*Command).Execute(0xc4202f6780, 0xc4202f1e00, 0x1)
/home/stgraber/data/code/go/src/github.com/spf13/cobra/command.go:800 +0x2b
main.main()
/home/stgraber/data/code/go/src/github.com/lxc/lxd/lxd/main.go:164 +0xea3
goroutine 5 [syscall]:
os/signal.signal_recv(0x0)
/usr/lib/go-1.10/src/runtime/sigqueue.go:139 +0xa6
os/signal.loop()
/usr/lib/go-1.10/src/os/signal/signal_unix.go:22 +0x22
created by os/signal.init.0
/usr/lib/go-1.10/src/os/signal/signal_unix.go:28 +0x41
@stgraber @brauner thanks for handling this so quickly. Presumably I'll need to either build from source or wait for the next bugfix release to get access to this?
I plan on cherry-picking this fix later this week, so you should just get a new snap build soon with a fix.