Is this a bug report or feature request? Bug Report
Deviation from expected behavior:
On a fresh Helm-based deployment of v1.4.3, the ceph-operator pod logs the following:
2020-09-15 03:30:36.303110 E | op-mgr: failed to enable service monitor. service monitor could not be enabled: failed to retrieve servicemonitor. servicemonitors.monitoring.coreos.com "rook-ceph-mgr" is forbidden: User "system:serviceaccount:rook-ceph:rook-ceph-system" cannot get resource "servicemonitors" in API group "monitoring.coreos.com" in the namespace "rook-ceph": RBAC: clusterrole.rbac.authorization.k8s.io "rook-ceph-system-psp-user" not found
W0915 03:30:36.314350 7 client_config.go:543] Neither --kubeconfig nor --master was specified. Using the inClusterConfig. This might not work.
2020-09-15 03:30:36.317293 E | op-mgr: failed to deploy prometheus rule. prometheus rule could not be deployed: failed to create prometheusRules. prometheusrules.monitoring.coreos.com is forbidden: User "system:serviceaccount:rook-ceph:rook-ceph-system" cannot create resource "prometheusrules" in API group "monitoring.coreos.com" in the namespace "rook-ceph": RBAC: clusterrole.rbac.authorization.k8s.io "rook-ceph-system-psp-user" not found
ServiceMonitors are evidently not created
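One way to confirm this (the rook-ceph-mgr name is taken from the operator log above):

kubectl -n rook-ceph get servicemonitors.monitoring.coreos.com   # rook-ceph-mgr should be listed once monitoring works
kubectl -n rook-ceph get prometheusrules.monitoring.coreos.com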
Expected behavior:
ServiceMonitors are created and the operator does not produce these error logs.
How to reproduce it (minimal and precise):
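Roughly the following (a sketch of my deployment, assuming Helm 3 and the standard rook-release chart repo; exact flags and values may differ, and the Prometheus Operator CRDs from monitoring.coreos.com were already installed in the cluster):

helm repo add rook-release https://charts.rook.io/release
kubectl create namespace rook-ceph
helm install --namespace rook-ceph rook-ceph rook-release/rook-ceph --version v1.4.3
# then apply a CephCluster CR with spec.monitoring.enabled: true
# (full CR below; cluster.yaml is just whatever file holds it)
kubectl apply -f cluster.yaml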
File(s) to submit:
CR:
root@cn1:~# kubectl get cephclusters.ceph.rook.io -n rook-ceph rook-ceph -o yaml
apiVersion: ceph.rook.io/v1
kind: CephCluster
metadata:
  annotations:
    kubectl.kubernetes.io/last-applied-configuration: |
      {"apiVersion":"ceph.rook.io/v1","kind":"CephCluster","metadata":{"annotations":{},"name":"rook-ceph","namespace":"rook-ceph"},"spec":{"cephVersion":{"image":"registry-internal.elpenguino.net/library/ceph:v15.2.4-20200819"},"dashboard":{"enabled":true},"dataDirHostPath":"/var/lib/rook","mon":{"allowMultiplePerNode":false,"count":3},"monitoring":{"enabled":true,"rulesNamespace":"rook-ceph"},"network":{"hostNetwork":true},"storage":{"config":{"databaseSizeMB":"1024","journalSizeMB":"1024"},"deviceFilter":"^sdb$","nodes":[{"devices":[{"name":"disk/by-path/pci-0000:00:1f.2-ata-3.0"}],"name":"wn1"},{"devices":[{"name":"disk/by-path/pci-0000:00:1f.2-ata-3.0"}],"name":"wn2"},{"devices":[{"name":"disk/by-path/pci-0000:00:1f.2-ata-3.0"}],"name":"wn3"},{"devices":[{"name":"disk/by-path/pci-0000:00:1f.2-ata-3.0"}],"name":"wn4"}],"resources":{"mgr":{"limits":{"cpu":"2000m","memory":"1Gi"},"modules":[{"enabled":true,"name":"pg_autoscaler"}],"requests":{"cpu":"2000m","memory":"1Gi"}},"mon":{"limits":{"cpu":"2000m","memory":"1Gi"},"requests":{"cpu":"2000m","memory":"1Gi"}},"osd":{"requests":{"cpu":"2000m","memory":"4Gi"}}},"useAllDevices":false,"useAllNodes":true}}}
  creationTimestamp: "2020-09-15T03:11:34Z"
  finalizers:
  - cephcluster.ceph.rook.io
  generation: 2
  managedFields:
  - apiVersion: ceph.rook.io/v1
    fieldsType: FieldsV1
    fieldsV1:
      f:metadata:
        f:annotations:
          .: {}
          f:kubectl.kubernetes.io/last-applied-configuration: {}
      f:spec:
        .: {}
        f:cephVersion:
          .: {}
          f:image: {}
        f:dashboard:
          .: {}
          f:enabled: {}
        f:dataDirHostPath: {}
        f:mon:
          .: {}
          f:count: {}
        f:monitoring:
          .: {}
          f:enabled: {}
          f:rulesNamespace: {}
        f:network:
          .: {}
          f:hostNetwork: {}
        f:storage:
          .: {}
          f:config:
            .: {}
            f:databaseSizeMB: {}
            f:journalSizeMB: {}
          f:deviceFilter: {}
          f:useAllDevices: {}
          f:useAllNodes: {}
    manager: kubectl
    operation: Update
    time: "2020-09-15T03:11:34Z"
  - apiVersion: ceph.rook.io/v1
    fieldsType: FieldsV1
    fieldsV1:
      f:metadata:
        f:finalizers:
          .: {}
          v:"cephcluster.ceph.rook.io": {}
      f:spec:
        f:cleanupPolicy:
          .: {}
          f:sanitizeDisks: {}
        f:crashCollector:
          .: {}
          f:disable: {}
        f:disruptionManagement: {}
        f:external:
          .: {}
          f:enable: {}
        f:healthCheck:
          .: {}
          f:daemonHealth:
            .: {}
            f:mon: {}
            f:osd: {}
            f:status: {}
          f:mgr: {}
        f:network:
          f:provider: {}
          f:selectors: {}
        f:removeOSDsIfOutAndSafeToRemove: {}
        f:storage:
          f:nodes: {}
          f:storageClassDeviceSets: {}
      f:status:
        .: {}
        f:ceph:
          .: {}
          f:health: {}
          f:lastChanged: {}
          f:lastChecked: {}
          f:previousHealth: {}
        f:conditions: {}
        f:message: {}
        f:phase: {}
        f:state: {}
        f:storage:
          .: {}
          f:deviceClasses: {}
        f:version:
          .: {}
          f:image: {}
          f:version: {}
    manager: rook
    operation: Update
    time: "2020-09-15T03:54:45Z"
  name: rook-ceph
  namespace: rook-ceph
  resourceVersion: "42872"
  selfLink: /apis/ceph.rook.io/v1/namespaces/rook-ceph/cephclusters/rook-ceph
  uid: 4d60e7ea-50d6-4829-9c06-690406c7fec7
spec:
  cephVersion:
    image: registry-internal.elpenguino.net/library/ceph:v15.2.4-20200819
  cleanupPolicy:
    sanitizeDisks: {}
  crashCollector:
    disable: false
  dashboard:
    enabled: true
  dataDirHostPath: /var/lib/rook
  disruptionManagement: {}
  external:
    enable: false
  healthCheck:
    daemonHealth:
      mon: {}
      osd: {}
      status: {}
    mgr: {}
  mon:
    count: 3
  monitoring:
    enabled: true
    rulesNamespace: rook-ceph
  network:
    hostNetwork: true
    provider: ""
    selectors: null
  removeOSDsIfOutAndSafeToRemove: false
  storage:
    config:
      databaseSizeMB: "1024"
      journalSizeMB: "1024"
    deviceFilter: ^sdb$
    nodes:
    - config: null
      devices:
      - config: null
        name: disk/by-path/pci-0000:00:1f.2-ata-3.0
      name: wn1
      resources: {}
    - config: null
      devices:
      - config: null
        name: disk/by-path/pci-0000:00:1f.2-ata-3.0
      name: wn2
      resources: {}
    - config: null
      devices:
      - config: null
        name: disk/by-path/pci-0000:00:1f.2-ata-3.0
      name: wn3
      resources: {}
    - config: null
      devices:
      - config: null
        name: disk/by-path/pci-0000:00:1f.2-ata-3.0
      name: wn4
      resources: {}
    storageClassDeviceSets: null
    useAllDevices: false
    useAllNodes: true
status:
  ceph:
    health: HEALTH_OK
    lastChanged: "2020-09-15T03:31:15Z"
    lastChecked: "2020-09-15T03:54:45Z"
    previousHealth: HEALTH_ERR
  conditions:
  - lastHeartbeatTime: "2020-09-15T03:31:05Z"
    lastTransitionTime: "2020-09-15T03:12:53Z"
    message: Cluster progression is completed
    reason: ProgressingCompleted
    status: "False"
    type: Progressing
  - lastHeartbeatTime: "2020-09-15T03:15:54Z"
    lastTransitionTime: "2020-09-15T03:15:54Z"
    message: Cluster created successfully
    reason: ClusterCreated
    status: "True"
    type: Ready
  - lastHeartbeatTime: "2020-09-15T03:15:55Z"
    lastTransitionTime: "2020-09-15T03:15:55Z"
    message: Cluster progression is completed
    reason: ProgressingCompleted
    status: "False"
  - lastHeartbeatTime: "2020-09-15T03:30:12Z"
    lastTransitionTime: "2020-09-15T03:30:12Z"
    message: Failed to configure ceph cluster
    reason: ClusterFailure
    status: "True"
    type: Failure
  message: Cluster created successfully
  phase: Ready
  state: Created
  storage:
    deviceClasses:
    - name: ssd
  version:
    image: registry-internal.elpenguino.net/library/ceph:v15.2.4-20200819
    version: 15.2.4-0
Environment:
- Kernel (uname -a): 5.8.9-1.el7.elrepo.x86_64
- Rook version (rook version inside of a Rook Pod): 1.4.3
- Storage backend version (ceph -v):
- Kubernetes version (kubectl version): 1.18
- Storage backend status (ceph health in the Rook Ceph toolbox):

Have you injected cluster/examples/kubernetes/ceph/monitoring/rbac.yaml?
I have not :) I expected the Helm chart to do that. Also, the string rook-ceph-system-psp-user reported by the operator does not appear in https://github.com/rook/rook/blob/master/cluster/examples/kubernetes/ceph/monitoring/rbac.yaml...
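(For reference, applying that file by hand from a checkout of the matching release branch would presumably just be:

kubectl apply -f cluster/examples/kubernetes/ceph/monitoring/rbac.yaml

though I haven't tried it.)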
Same behavior on v1.4.4.
Successfully worked around by creating the following Role and RoleBinding:
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: ceph-prometheus
  namespace: rook-ceph
rules:
- apiGroups:
  - monitoring.coreos.com
  resources:
  - servicemonitors
  - prometheusrules
  verbs:
  - '*'
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: ceph-prometheus
  namespace: rook-ceph
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: ceph-prometheus
subjects:
- kind: ServiceAccount
  name: rook-ceph-system
  namespace: rook-ceph
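Applied with something like the following (ceph-prometheus-rbac.yaml is just whatever file the manifest above is saved in; restarting the operator pod may be needed for it to retry):

kubectl apply -f ceph-prometheus-rbac.yaml
kubectl -n rook-ceph get servicemonitors.monitoring.coreos.com   # rook-ceph-mgr should now appear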