20.03.1
This is the latest release
Kubernetes running on 3 m5.xlarge nodes (4 vCPU / 16 GB RAM each)
K8s yaml for cluster setup
# LoadBalancer Service that exposes the Dgraph Alpha pods' HTTP (8080) and
# gRPC (9080) ports. Pods are selected by the label app=dgraph-alpha.
apiVersion: v1
kind: Service
metadata:
  name: dgraph-alpha-public
  labels:
    app: dgraph-alpha
    monitor: alpha-dgraph-io
spec:
  type: LoadBalancer
  ports:
    - port: 8080
      targetPort: 8080
      name: alpha-http
    - port: 9080
      targetPort: 9080
      name: alpha-grpc
  selector:
    app: dgraph-alpha
---
# Same ports and selector as dgraph-alpha-public, but annotated so that AWS
# provisions an internal load balancer.
apiVersion: v1
kind: Service
metadata:
  name: dgraph-alpha-private
  annotations:
    service.beta.kubernetes.io/aws-load-balancer-internal: "true"
  labels:
    app: dgraph-alpha
    monitor: alpha-dgraph-io
spec:
  type: LoadBalancer
  ports:
    - port: 8080
      targetPort: 8080
      name: alpha-http
    - port: 9080
      targetPort: 9080
      name: alpha-grpc
  selector:
    app: dgraph-alpha
# ---
# # This service is created in order to debug and profile a specific alpha.
# # You can create one for each alpha that you need to profile.
# # For a more general HTTP APIs use the above service instead.
# apiVersion: v1
# kind: Service
# metadata:
# name: dgraph-alpha-0-http-public
# labels:
# app: dgraph-alpha
# spec:
# type: LoadBalancer
# ports:
# - port: 8080
# targetPort: 8080
# name: alpha-http
# selector:
# statefulset.kubernetes.io/pod-name: dgraph-alpha-0
---
# LoadBalancer Service that exposes the Ratel pods on port 8000.
# Pods are selected by the label app=dgraph-ratel.
apiVersion: v1
kind: Service
metadata:
  name: dgraph-ratel-public
  labels:
    app: dgraph-ratel
spec:
  type: LoadBalancer
  ports:
    - port: 8000
      targetPort: 8000
      name: ratel-http
  selector:
    app: dgraph-ratel
---
# This is a headless service (clusterIP: None) which is necessary for
# discovery for the dgraph-zero StatefulSet.
# https://kubernetes.io/docs/tutorials/stateful-application/basic-stateful-set/#creating-a-statefulset
apiVersion: v1
kind: Service
metadata:
  name: dgraph-zero
  labels:
    app: dgraph-zero
spec:
  ports:
    - port: 5080
      targetPort: 5080
      name: zero-grpc
  clusterIP: None
  # We want all pods in the StatefulSet to have their addresses published for
  # the sake of the other Dgraph Zero pods even before they're ready, since they
  # have to be able to talk to each other in order to become ready.
  publishNotReadyAddresses: true
  selector:
    app: dgraph-zero
---
# This is a headless service (clusterIP: None) which is necessary for
# discovery for the dgraph-alpha StatefulSet.
# https://kubernetes.io/docs/tutorials/stateful-application/basic-stateful-set/#creating-a-statefulset
apiVersion: v1
kind: Service
metadata:
  name: dgraph-alpha
  labels:
    app: dgraph-alpha
spec:
  ports:
    - port: 7080
      targetPort: 7080
      name: alpha-grpc-int
  clusterIP: None
  # We want all pods in the StatefulSet to have their addresses published for
  # the sake of the other Dgraph alpha pods even before they're ready, since they
  # have to be able to talk to each other in order to become ready.
  publishNotReadyAddresses: true
  selector:
    app: dgraph-alpha
---
# This StatefulSet runs 3 Dgraph Zero pods, each with its own 5Gi "datadir"
# volume claim mounted at /dgraph.
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: dgraph-zero
spec:
  serviceName: "dgraph-zero"
  replicas: 3
  selector:
    matchLabels:
      app: dgraph-zero
  template:
    metadata:
      labels:
        app: dgraph-zero
    spec:
      affinity:
        # Prefer (but do not require) placing Zero pods on different nodes.
        podAntiAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 100
              podAffinityTerm:
                labelSelector:
                  matchExpressions:
                    - key: app
                      operator: In
                      values: ["dgraph-zero"]
                topologyKey: kubernetes.io/hostname
      containers:
        - name: zero
          image: dgraph/dgraph:v20.03.1
          imagePullPolicy: IfNotPresent
          ports:
            - containerPort: 5080
              name: zero-grpc
            - containerPort: 6080
              name: zero-http
          volumeMounts:
            - name: datadir
              mountPath: /dgraph
          env:
            # Used below to build the fully qualified DNS name of dgraph-zero-0.
            - name: POD_NAMESPACE
              valueFrom:
                fieldRef:
                  fieldPath: metadata.namespace
            - name: GODEBUG
              value: madvdontneed=1
          # The script derives the pod ordinal from the hostname suffix and
          # uses ordinal + 1 as --idx. Pod 0 starts without --peer; every
          # other pod passes --peer pointing at dgraph-zero-0.
          command:
            - bash
            - "-c"
            - |
              set -ex
              [[ `hostname` =~ -([0-9]+)$ ]] || exit 1
              ordinal=${BASH_REMATCH[1]}
              idx=$(($ordinal + 1))
              if [[ $ordinal -eq 0 ]]; then
                exec dgraph zero --my=$(hostname -f):5080 --idx $idx --replicas 3 -v=2
              else
                exec dgraph zero --my=$(hostname -f):5080 --peer dgraph-zero-0.dgraph-zero.${POD_NAMESPACE}.svc.cluster.local:5080 --idx $idx --replicas 3 -v=2
              fi
          livenessProbe:
            httpGet:
              path: /health
              port: 6080
            initialDelaySeconds: 15
            periodSeconds: 10
            timeoutSeconds: 5
            failureThreshold: 6
            successThreshold: 1
          readinessProbe:
            httpGet:
              path: /state
              port: 6080
            initialDelaySeconds: 15
            periodSeconds: 10
            timeoutSeconds: 5
            failureThreshold: 6
            successThreshold: 1
      terminationGracePeriodSeconds: 60
      volumes:
        - name: datadir
          persistentVolumeClaim:
            claimName: datadir
  updateStrategy:
    type: RollingUpdate
  volumeClaimTemplates:
    - metadata:
        name: datadir
        annotations:
          volume.alpha.kubernetes.io/storage-class: anything
      spec:
        accessModes:
          - "ReadWriteOnce"
        resources:
          requests:
            storage: 5Gi
---
# This StatefulSet runs 3 replicas of Dgraph Alpha, each with its own 100Gi
# "datadir" volume claim mounted at /dgraph.
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: dgraph-alpha
spec:
  serviceName: "dgraph-alpha"
  replicas: 3
  selector:
    matchLabels:
      app: dgraph-alpha
  template:
    metadata:
      labels:
        app: dgraph-alpha
    spec:
      affinity:
        # Prefer (but do not require) placing Alpha pods on different nodes.
        podAntiAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 100
              podAffinityTerm:
                labelSelector:
                  matchExpressions:
                    - key: app
                      operator: In
                      values: ["dgraph-alpha"]
                topologyKey: kubernetes.io/hostname
      containers:
        - name: alpha
          image: dgraph/dgraph:v20.03.1
          imagePullPolicy: IfNotPresent
          ports:
            - containerPort: 7080
              name: alpha-grpc-int
            - containerPort: 8080
              name: alpha-http
            - containerPort: 9080
              name: alpha-grpc
          volumeMounts:
            - name: datadir
              mountPath: /dgraph
          resources:
            requests:
              memory: "2Gi"
              cpu: "1"
            limits:
              memory: "12Gi"
              cpu: "2"
          env:
            # This should be the same namespace as the dgraph-zero
            # StatefulSet to resolve a Dgraph Zero's DNS name for
            # Alpha's --zero flag.
            - name: POD_NAMESPACE
              valueFrom:
                fieldRef:
                  fieldPath: metadata.namespace
            - name: GODEBUG
              value: madvdontneed=1
          # exec replaces the bash wrapper with the dgraph process (the same
          # pattern the dgraph-zero StatefulSet uses), so signals sent to the
          # container reach dgraph directly instead of the shell.
          command:
            - bash
            - "-c"
            - |
              set -ex
              exec dgraph alpha --my=$(hostname -f):7080 --lru_mb 2048 --zero dgraph-zero-0.dgraph-zero.${POD_NAMESPACE}.svc.cluster.local:5080 -v=2
          livenessProbe:
            httpGet:
              path: /health?live=1
              port: 8080
            initialDelaySeconds: 15
            periodSeconds: 10
            timeoutSeconds: 5
            failureThreshold: 6
            successThreshold: 1
          readinessProbe:
            httpGet:
              path: /health
              port: 8080
            initialDelaySeconds: 15
            periodSeconds: 10
            timeoutSeconds: 5
            failureThreshold: 6
            successThreshold: 1
      terminationGracePeriodSeconds: 600
      volumes:
        - name: datadir
          persistentVolumeClaim:
            claimName: datadir
  updateStrategy:
    type: RollingUpdate
  volumeClaimTemplates:
    - metadata:
        name: datadir
        annotations:
          volume.alpha.kubernetes.io/storage-class: anything
      spec:
        accessModes:
          - "ReadWriteOnce"
        resources:
          requests:
            storage: 100Gi
---
# Deployment that runs Ratel via the dgraph-ratel command from the dgraph
# image, listening on container port 8000. replicas is not set, so the
# Kubernetes default applies.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: dgraph-ratel
  labels:
    app: dgraph-ratel
spec:
  selector:
    matchLabels:
      app: dgraph-ratel
  template:
    metadata:
      labels:
        app: dgraph-ratel
    spec:
      containers:
        - name: ratel
          image: dgraph/dgraph:v20.03.1
          ports:
            - containerPort: 8000
          command:
            - dgraph-ratel
Hitting the Alpha load balancer with ~25-75 mutations/sec to ingest data into the graph (or really any sustained flow of data into the Alpha nodes).
Expected behaviour:
Actual Result:

Continual cycle of OOM kill and crash loop backoff causing restarting

Image of the Pod cycling through OOM errors

The pprof profiles do not show the same memory pressure; they show only about 2 GB in use even while the container is being OOM-killed.
Pprof inuse_objects show a rather high amount of objects though.
I believe the issue lies with lack of GC, or potentially a Memory leak within the Alpha pods
pprof.dgraph.alloc_objects.alloc_space.inuse_objects.inuse_space.009.pb.gz
pprof.dgraph.samples.cpu.001.pb.gz
This issue is Blocking our team, so any help would be greatly appreciated
Adding some additional Pprof logs.
Here are two profiles taken back to back, in which memory jumps from ~3 GB to ~5 GB within a couple of seconds. Right after that, the pod was OOM-killed.
pprof.dgraph.alloc_objects.alloc_space.inuse_objects.inuse_space.014.pb.gz
pprof.dgraph.alloc_objects.alloc_space.inuse_objects.inuse_space.015.pb.gz
You can see resolve.squashFragments jump from 2.21gb to 4gb in about 1 second. Then the whole pod crashes


Some additional information after seeing the following issue:
https://github.com/dgraph-io/dgraph/issues/5315
I am running ~90 mutations per GraphQL mutation block. Each mutation is nested two levels deep.
Update:
I have since reduced the mutation block size to 10 mutations per GraphQL block, and it still consumes all available memory.
Hi,
Here is most of my schema plus a sample of the type of mutations I'm doing. The mutation is dynamic based on the streaming input data that we get so it could change quite a bit, but this is probably the most common one.
# A workspace, identified by workspaceId (an @id field with a hash index).
type Workspace {
  workspaceId: String! @search(by: [hash]) @id
  workspaceName: String
}

# Fields shared by every identifier type; key is the @id field.
# NOTE(review): GroupTraits is referenced here but not defined in this
# excerpt; presumably it is declared elsewhere in the full schema.
interface Id {
  key: String! @search(by: [hash]) @id
  onWorkspace: [Workspace]!
  hasTraits: [Traits]
  hasGroupTraits: [GroupTraits]
}

# AnonymousId, Email and UserId reference each other pairwise; every such
# edge is declared with @hasInverse on both sides.
# NOTE(review): Experiment is referenced but not defined in this excerpt.
type AnonymousId implements Id {
  email: [Email] @hasInverse(field: anonymousId)
  userId: [UserId] @hasInverse(field: anonymousId)
  hasExperiment: [Experiment]
}

type Email implements Id {
  anonymousId: [AnonymousId] @hasInverse(field: email)
  userId: [UserId] @hasInverse(field: email)
}

type UserId implements Id {
  anonymousId: [AnonymousId] @hasInverse(field: userId)
  email: [Email] @hasInverse(field: userId)
}

# A trait record attached to an Id via hasTraits.
type Traits {
  id: String! @search(by: [hash]) @id
  traitBlob: String! @search(by: [regexp])
  integration: String! @search(by: [term])
  onWorkspace: [Workspace!]!
  createdOn: DateTime!
}
Mutation:
# One aliased field from a larger mutation block: updates the AnonymousId
# whose key is "NewTest2", setting its workspace, traits, and a nested UserId
# (which in turn carries its own workspace and traits).
# "${integration}" is a placeholder substituted by the client before sending.
upd2: updateAnonymousId(input: {
  filter: {key: {eq: "NewTest2"}},
  set: {
    onWorkspace: [{workspaceId: "testWorkspace"}],
    hasTraits: [{id: "%7B%22TEST%22%3A%22test%22%7D:CLIENT:testWorkspace", integration: "${integration}", traitBlob: "%7B%22TEST%22%3A%22test%22%7D", createdOn: "2020-05-11T17:37:40.664Z", onWorkspace: [{workspaceId: "testWorkspace"}]}],
    userId: [{
      # key belongs to the UserId object itself, not inside the onWorkspace list.
      key: "testUser",
      onWorkspace: [{workspaceId: "testWorkspace"}],
      hasTraits: [{id: "%7B%22TEST%22%3A%22test%22%7D:CLIENT:testWorkspace", integration: "${integration}", traitBlob: "%7B%22TEST%22%3A%22test%22%7D", createdOn: "2020-05-11T17:37:40.664Z", onWorkspace: [{workspaceId: "testWorkspace"}]}]
    }]
  }
}) {
  # Payload field names are case-sensitive: anonymousId, not anonymousid.
  anonymousId {
    key
  }
}
Hi @guhan-v I think I mentioned in slack that we were trying to fix this for 20.03.2 release. It didn't make it into that because that had an urgent release. We'll try and get it out in 20.03.3 asap.
Gotcha @MichaelJCompton ,
Thanks for the heads up. Do you happen to know if there is a schedule for that release?
Just want to plan my next sprint appropriately
I'll touch base about a release when it firms up. There's a couple of changes heading for 20.03.3
This fix should be done by early next week.
Sounds good thanks for the heads up
Hey, sorry for the delay. We have merged a solution in the release/v20.03 branch. Upon running benchmarks in the new release branch, we can see significant improvement.
Would you be able to check if the solution works for you?
Most helpful comment
Hi @guhan-v I think I mentioned in slack that we were trying to fix this for 20.03.2 release. It didn't make it into that because that had an urgent release. We'll try and get it out in 20.03.3 asap.