_This issue was originally opened by @shanehughes1990 as hashicorp/terraform#27363. It was migrated here as a result of the provider split. The original body of the issue is below._
Terraform v0.14.0
have also tried
Terraform v0.14.3
main.tf
# GKE master, instantiated from the private "master/gke" registry module.
module "gke_master" {
  source  = "app.terraform.io/Simplified/master/gke"
  version = "v0.0.10"

  # Explicit ordering on the networking resources (currently commented out).
  # depends_on = [
  # google_compute_network.network,
  # google_compute_subnetwork.subnet,
  # google_compute_shared_vpc_host_project.shared_vpc,
  # google_compute_subnetwork_iam_member.subnet_iam_member,
  # google_compute_shared_vpc_service_project.vpc_service_project,
  # google_compute_router_nat.ps_nat,
  # google_compute_address.nat
  # ]

  # Cluster identity and placement.
  cluster_project_id = var.project_id
  cluster_name       = var.cluster_name
  cluster_location   = var.cluster_location
  release_channel    = "REGULAR"

  # Basic-auth credentials forwarded to the module.
  master_username = var.master_username
  master_password = var.master_password

  # Required for private cluster
  enable_private_cluster   = false
  enable_custom_networking = false

  # Custom network wiring (currently commented out).
  # network_name = google_compute_network.network.self_link
  # subnet_name = google_compute_subnetwork.subnet.self_link
  # pod_subnet_name = google_compute_subnetwork.subnet.secondary_ip_range[0].range_name
  # services_subnet_name = google_compute_subnetwork.subnet.secondary_ip_range[1].range_name
}
ingress.tf
# Namespace "ingress"; creation is explicitly ordered after module.gke_default_pool.
resource "kubernetes_namespace" "ingress" {
  metadata {
    name = "ingress"
  }

  depends_on = [module.gke_default_pool]
}
# NGINX ingress, installed from the private "nginx-ingress/helm" module
# into the "ingress" namespace resource.
module "nginx_ingress" {
  source  = "app.terraform.io/Simplified/nginx-ingress/helm"
  version = "v0.0.25"

  namespace = kubernetes_namespace.ingress.metadata[0].name

  # Required for private cluster
  enable_admission_webhooks = false

  depends_on = [kubernetes_namespace.ingress]
}
# "prod" certificate issuer from the private "cert-issuer/helm" module;
# explicitly ordered after the ingress namespace and the nginx_ingress module.
module "production_issuer" {
  source  = "app.terraform.io/Simplified/cert-issuer/helm"
  version = "v0.0.4"

  name       = "prod"
  namespace  = kubernetes_namespace.ingress.metadata[0].name
  acme_email = "redacted"

  project_id    = var.hub_project_id
  helm_password = var.helm_password

  depends_on = [kubernetes_namespace.ingress, module.nginx_ingress]
}
sqlproxy.tf
# Namespace "sqlproxy"; creation is explicitly ordered after module.gke_default_pool.
resource "kubernetes_namespace" "sql_proxy" {
  metadata {
    name = "sqlproxy"
  }

  depends_on = [module.gke_default_pool]
}
# SQL proxy from the private "sqlproxy/k8s" module, placed in the
# "sqlproxy" namespace resource.
module "sql_proxy" {
  source  = "app.terraform.io/Simplified/sqlproxy/k8s"
  version = "v0.0.1"

  namespace = kubernetes_namespace.sql_proxy.metadata[0].name

  # GCP project and service account handed to the module.
  gcp_project_id           = "sasp-ca"
  gcp_service_account_name = "ps-production-sqlproxy"

  # Instance the module is pointed at.
  instance_name   = "sasp-core-ca"
  instance_region = "northamerica-northeast1"

  depends_on = [kubernetes_namespace.sql_proxy]
}
providers.tf
# Google provider, authenticated with the base64-encoded JSON key in var.gcp_json.
provider "google" {
  credentials = base64decode(var.gcp_json)
  project     = var.project_id

  # Derive the region from the first two dash-separated parts of the location
  # ("northamerica-northeast1-a" -> "northamerica-northeast1"). Unlike
  # trimsuffix(..., "-a"), this also works for zones not ending in "-a" and
  # leaves a location that is already a region unchanged.
  region = join("-", slice(split("-", var.cluster_location), 0, 2))

  # NOTE(review): this is only a valid zone when cluster_location is zonal - confirm.
  zone = var.cluster_location
}
# Google beta provider, authenticated with the base64-encoded JSON key in var.gcp_json.
provider "google-beta" {
  credentials = base64decode(var.gcp_json)
  project     = var.project_id

  # Derive the region from the first two dash-separated parts of the location
  # ("northamerica-northeast1-a" -> "northamerica-northeast1"). Unlike
  # trimsuffix(..., "-a"), this also works for zones not ending in "-a" and
  # leaves a location that is already a region unchanged.
  region = join("-", slice(split("-", var.cluster_location), 0, 2))

  # NOTE(review): this is only a valid zone when cluster_location is zonal - confirm.
  zone = var.cluster_location
}
# Kubernetes provider configured entirely from the gke_master module outputs
# (endpoint, token, cluster CA); loading a kubeconfig file is disabled.
provider "kubernetes" {
  load_config_file = false

  host                   = module.gke_master.endpoint
  token                  = module.gke_master.token
  cluster_ca_certificate = base64decode(module.gke_master.cluster_ca_certificate)
}
# Helm provider; its kubernetes connection is configured from the
# gke_master module outputs (endpoint, token, cluster CA).
provider "helm" {
  kubernetes {
    # load_config_file = false
    host                   = module.gke_master.endpoint
    token                  = module.gke_master.token
    cluster_ca_certificate = base64decode(module.gke_master.cluster_ca_certificate)
  }
}
versions.tf
# Terraform core constraint and exact provider version pins.
terraform {
  required_version = ">= 0.13"

  required_providers {
    # Google Cloud providers (GA and beta), pinned to the same release.
    google = {
      version = "3.51.0"
      source  = "hashicorp/google"
    }
    google-beta = {
      version = "3.51.0"
      source  = "hashicorp/google-beta"
    }

    # In-cluster providers.
    kubernetes = {
      version = "1.13.2"
      source  = "hashicorp/kubernetes"
    }
    helm = {
      version = "2.0.1"
      source  = "hashicorp/helm"
    }
  }
}
module main.tf
# Client configuration of the google provider; its access_token attribute
# is exported by this module as the "token" output.
data "google_client_config" "gcp_cluster" {}
# GKE cluster. The default node pool is removed right after creation.
resource "google_container_cluster" "gcp_cluster" {
  project        = var.cluster_project_id
  name           = var.cluster_name
  location       = var.cluster_location
  node_locations = var.additional_cluster_zones

  network    = var.network_name
  subnetwork = var.subnet_name

  # A cluster cannot be created without a node pool, but only separately
  # managed node pools are wanted. So the smallest possible default pool is
  # created and immediately deleted.
  initial_node_count       = 1
  remove_default_node_pool = true

  release_channel {
    channel = var.release_channel
  }

  maintenance_policy {
    daily_maintenance_window {
      start_time = var.maintenance_start_time
    }
  }

  master_auth {
    username = var.master_username
    password = var.master_password

    client_certificate_config {
      issue_client_certificate = true
    }
  }

  # Emitted only when custom networking is enabled.
  dynamic "ip_allocation_policy" {
    for_each = var.enable_custom_networking != true ? [] : [var.enable_custom_networking]

    content {
      cluster_secondary_range_name  = var.pod_subnet_name
      services_secondary_range_name = var.services_subnet_name
    }
  }

  # Emitted only when the private cluster option is enabled.
  dynamic "private_cluster_config" {
    for_each = var.enable_private_cluster != true ? [] : [var.enable_private_cluster]

    content {
      enable_private_nodes    = true
      enable_private_endpoint = false
      master_ipv4_cidr_block  = var.master_cidr_range
    }
  }

  timeouts {
    create = var.cluster_create_timeout
    delete = var.cluster_delete_timeout
  }
}
module outputs.tf
# Username from the cluster's first master_auth block.
output "cluster_username" {
  value = google_container_cluster.gcp_cluster.master_auth[0].username
}
# Password from the cluster's first master_auth block.
# Marked sensitive so Terraform redacts it in plan/apply output; a root
# module that re-exports this value must mark its own output sensitive too.
output "cluster_password" {
  value     = google_container_cluster.gcp_cluster.master_auth[0].password
  sensitive = true
}
# The cluster resource's endpoint attribute.
output "endpoint" {
  value = google_container_cluster.gcp_cluster.endpoint
}
# The cluster resource's instance_group_urls attribute.
output "instance_group_urls" {
  value = google_container_cluster.gcp_cluster.instance_group_urls
}
# The cluster resource's node_config attribute.
output "node_config" {
  value = google_container_cluster.gcp_cluster.node_config
}
# The cluster resource's node_pool attribute.
output "node_pools" {
  value = google_container_cluster.gcp_cluster.node_pool
}
# Client certificate from the cluster's first master_auth block.
output "client_certificate" {
  value = google_container_cluster.gcp_cluster.master_auth[0].client_certificate
}
# Client key from the cluster's first master_auth block.
# Marked sensitive so Terraform redacts it in plan/apply output; a root
# module that re-exports this value must mark its own output sensitive too.
output "client_key" {
  value     = google_container_cluster.gcp_cluster.master_auth[0].client_key
  sensitive = true
}
# Cluster CA certificate from the cluster's first master_auth block.
output "cluster_ca_certificate" {
  value = google_container_cluster.gcp_cluster.master_auth[0].cluster_ca_certificate
}
# Access token read from the google_client_config data source.
# Marked sensitive so Terraform redacts it in plan/apply output; a root
# module that re-exports this value must mark its own output sensitive too.
output "token" {
  value     = data.google_client_config.gcp_cluster.access_token
  sensitive = true
}
module variables.tf
# Required input (no default); used as the cluster resource's name.
variable "cluster_name" {
  description = "The name of the cluster, unique within the project and location"
}
# Required input (no default); used as the cluster resource's location.
variable "cluster_location" {
  description = "The location (region or zone) in which the cluster master will be created, as well as the default node location"
}
# Optional list of zones; used as the cluster resource's node_locations.
variable "additional_cluster_zones" {
  type        = list(string)
  default     = []
  description = "The list of zones in which the cluster's nodes are located. Nodes must be in the region of their regional cluster or in the same region as their cluster's zone for zonal clusters"
}
# Required input (no default); used as the cluster resource's project.
variable "cluster_project_id" {
  description = "The ID of the project in which the resource belongs"
}
# Required input (no default). The available release channels are listed at
# https://cloud.google.com/kubernetes-engine/docs/concepts/release-channels
variable "release_channel" {
  description = "GKE master update release channel"
}
# Required input (no default); used as the master_auth username.
variable "master_username" {
  description = "The username to use for HTTP basic authentication when accessing the Kubernetes master endpoint"
}
# Required input (no default); used as the master_auth password.
variable "master_password" {
  description = "The password to use for HTTP basic authentication when accessing the Kubernetes master endpoint"
}
# Value for the cluster resource's `timeouts.create`; defaults to 30 minutes.
variable "cluster_create_timeout" {
  default = "30m"
}
# Value for the cluster resource's `timeouts.delete`; defaults to one hour.
variable "cluster_delete_timeout" {
  default = "1h"
}
# Start time of the cluster's daily maintenance window; time is in GMT timezone.
# The previous description was a copy-paste error about the number of
# unavailable nodes during an upgrade; this value is only used as
# maintenance_policy.daily_maintenance_window.start_time.
variable "maintenance_start_time" {
  description = "Start time of the daily maintenance window, e.g. \"06:00\", NOTE time is in GMT timezone"
  default     = "06:00"
}
# Used as the cluster resource's network; falls back to "default".
variable "network_name" {
  default     = "default"
  description = "name of the network to use"
}
# Used as the cluster resource's subnetwork; falls back to "default".
variable "subnet_name" {
  default     = "default"
  description = "name of the subnet to use"
}
# Toggle for the cluster's dynamic ip_allocation_policy block; off by default.
variable "enable_custom_networking" {
  type        = bool
  default     = false
  description = "Enable custom networking"
}
# Toggle for the cluster's dynamic private_cluster_config block; off by default.
variable "enable_private_cluster" {
  type        = bool
  default     = false
  description = "Enable private cluster"
}
# Used as ip_allocation_policy.cluster_secondary_range_name.
variable "pod_subnet_name" {
  default     = "pod-subnet"
  description = "name of the pod subnet to use"
}
# Used as ip_allocation_policy.services_secondary_range_name.
variable "services_subnet_name" {
  default     = "services-subnet"
  description = "name of the services subnet to use"
}
# Used as private_cluster_config.master_ipv4_cidr_block.
variable "master_cidr_range" {
  default     = "172.16.0.32/28"
  description = "Master CIDR network"
}
https://gist.github.com/shanehughes1990/12f787bbcd7f22d2ca034e68195ce47e
After Terraform has applied the config, everything comes up as expected. However, when attempting to change enable_private_cluster to true, enable_custom_networking to true, or make any other change in the gke_master module, Terraform errors when it should instead produce a valid plan with the changes.
terraform errors on terraform plan
shanehughes@Ubuntu20Desktop:~/repos/clusters/terraform-ps-cluster$ terraform plan
module.gke_master.google_container_cluster.gcp_cluster: Refreshing state... [id=projects/parksmart-production/locations/northamerica-northeast1-a/clusters/cluster-production]
module.gke_default_pool.google_container_node_pool.gcp_cluster_nodes: Refreshing state... [id=projects/parksmart-production/locations/northamerica-northeast1-a/clusters/cluster-production/nodePools/default-pool]
kubernetes_namespace.ingress: Refreshing state... [id=ingress]
kubernetes_namespace.sql_proxy: Refreshing state... [id=sqlproxy]
Error: Get "http://localhost/api/v1/namespaces/sqlproxy": dial tcp 127.0.0.1:80: connect: connection refused
Error: Get "http://localhost/api/v1/namespaces/ingress": dial tcp 127.0.0.1:80: connect: connection refused
Hard for me to explain how to reproduce, as these are all private modules, but I guess try bringing up a cluster, some namespaces, and change something in the google_container_cluster resource and try terraform plan again.
This DOES NOT happen on terraform 0.13.5, terraform works as intended.
Currently testing on Ubuntu 20
This "will" be running in terraform cloud, (State is saved there), Was working out all the bugs locally before I let terraform cloud take over.
Running on GKE, RELEASE branch
Server Version: version.Info{Major:"1", Minor:"17+", GitVersion:"v1.17.13-gke.2600", GitCommit:"fc4bf3b03703b51c48ba123e8abc53b3051ba8a7", GitTreeState:"clean", BuildDate:"2020-11-11T09:20:10Z", GoVersion:"go1.13.15b4", Compiler:"gc", Platform:"linux/amd64"}
@shanehughes1990 this appears to be happening because the cluster is being configured in the same apply operation as your other resources (described in the docs here as an unstable setup - https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs#stacking-with-managed-kubernetes-cluster-resources)
So okay, is the cause that it has to delete the namespace first before recreating the master? Also, why does it only happen on Terraform 0.14? It works as I expect it to on 0.13.5. Is there a way around this, since using Terraform Cloud with this setup does not work (it fails)? Would destroying the resources before changing the GKE setup be the solution?
@shanehughes1990 the difficulty you're describing with varying behaviour between versions is why we discourage having cluster configuration in the same apply as other resources. On your local machine, you could use -target to do more directed applies, but in terraform cloud you would need to configure run triggers in order to have this separation. We are aware of this general issue and tracking use cases and progress in https://github.com/hashicorp/terraform/issues/4149.
You can learn more on how to use run triggers by following the learn guide
I'll close this issue for now - please reopen if you continue to face difficulty.
Please re-open. This happens for us too. We were happily working away in TF13 with multiple applies, destroys, updates, etc. all working perfectly fine with Infrastructure and K8S and Helm in sub-modules with dependencies between one another in a composite module. We didn't have any issues. However, as soon as we moved to TF14 - boom! It all stopped working, and we get the exact same message as mentioned above.
I am afraid that targeted applies will greatly increase the deployment time for us - it does feel like a "go-to" response for these types of challenges in various bug reports, it would be great to simply get them resolved. Evidently, something has broken between TF13 and TF14.
It should also be noted that if we do a destroy, then everything works as expected and TF can successfully connect to K8S to destroy the required resources. This only occurs during a plan against already applied infrastructure.
Please re-open. This happens for us too. We were happily working away in TF13 with multiple applies, destroys, updates, etc. all working perfectly fine with Infrastructure and K8S and Helm in sub-modules with dependencies between one another in a composite module. We didn't have any issues. However, as soon as we moved to TF14 - boom! It all stopped working, and we get the exact same message as mentioned above.
I am afraid that targeted applies will greatly increase the deployment time for us - it does feel like a "go-to" response for these types of challenges in various bug reports, it would be great to simply get them resolved. Evidently, something has broken between TF13 and TF14.
It should also be noted that if we do a destroy, then everything works as expected and TF can successfully connect to K8S to destroy the required resources. This only occurs during a plan against already applied infrastructure.
We have since moved all in-cluster resources to a different state, and everything works as intended. My suggestion to you as well would be to make sure you have no Kubernetes resources in the same state as the resources that build the cluster.
I'm going to lock this issue because it has been closed for _30 days_ ⏳. This helps our maintainers find and focus on the active issues.
If you feel this issue should be reopened, we encourage creating a new issue linking back to this one for added context. If you feel I made an error 🤖 🙉 , please reach out to my human friends 👉 [email protected]. Thanks!
Most helpful comment
We have since moved all in-cluster resources to a different state, and everything works as intended. My suggestion to you as well would be to make sure you have no Kubernetes resources in the same state as the resources that build the cluster.