Terraform-provider-kubernetes: terraform plan - dial tcp 127.0.0.1:80: connect: connection refused

Created on 26 Dec 2020  ·  8 Comments  ·  Source: hashicorp/terraform-provider-kubernetes

_This issue was originally opened by @shanehughes1990 as hashicorp/terraform#27363. It was migrated here as a result of the provider split. The original body of the issue is below._


Terraform Version

Terraform v0.14.0

have also tried

Terraform v0.14.3

Terraform Configuration Files

main.tf

# Provisions the GKE cluster master through the private "master/gke" registry
# module, pinned to v0.0.10. Its endpoint, token and cluster_ca_certificate
# outputs are what the kubernetes and helm provider blocks are configured from.
module "gke_master" {
  source  = "app.terraform.io/Simplified/master/gke"
  version = "v0.0.10"

  # Explicit ordering against the shared-VPC / NAT resources. Currently
  # commented out, like the custom-networking inputs at the bottom of this block.
  # depends_on = [
  #   google_compute_network.network,
  #   google_compute_subnetwork.subnet,
  #   google_compute_shared_vpc_host_project.shared_vpc,
  #   google_compute_subnetwork_iam_member.subnet_iam_member,
  #   google_compute_shared_vpc_service_project.vpc_service_project,
  #   google_compute_router_nat.ps_nat,
  #   google_compute_address.nat
  # ]

  cluster_name       = var.cluster_name
  cluster_location   = var.cluster_location
  cluster_project_id = var.project_id

  # Basic-auth credentials forwarded to the cluster's master_auth block.
  master_password = var.master_password
  master_username = var.master_username

  release_channel = "REGULAR"

  # Required for private cluster
  # Both switches are off here; per the report, changing either of them to true
  # is one of the edits that makes `terraform plan` fail.
  enable_private_cluster   = false
  enable_custom_networking = false
  # network_name             = google_compute_network.network.self_link
  # subnet_name              = google_compute_subnetwork.subnet.self_link
  # pod_subnet_name          = google_compute_subnetwork.subnet.secondary_ip_range[0].range_name
  # services_subnet_name     = google_compute_subnetwork.subnet.secondary_ip_range[1].range_name
}

ingress.tf

# Namespace named "ingress", used by the two modules below.
# depends_on orders it after module.gke_default_pool, which is declared outside
# the files quoted in this report (it only appears in the plan output).
resource "kubernetes_namespace" "ingress" {
  depends_on = [module.gke_default_pool]
  metadata {
    name = "ingress"
  }
}

# Private "nginx-ingress/helm" registry module (v0.0.25), deployed into the
# "ingress" namespace. Presumably installs the NGINX ingress controller via
# Helm -- the module source is not shown here.
module "nginx_ingress" {
  depends_on = [kubernetes_namespace.ingress]
  source     = "app.terraform.io/Simplified/nginx-ingress/helm"
  version    = "v0.0.25"
  namespace  = kubernetes_namespace.ingress.metadata[0].name

  # Required for private cluster
  enable_admission_webhooks = false
}

# Private "cert-issuer/helm" registry module (v0.0.4) creating an issuer named
# "prod" in the "ingress" namespace. Ordered after both the namespace and the
# nginx_ingress module.
module "production_issuer" {
  depends_on = [kubernetes_namespace.ingress, module.nginx_ingress]
  source     = "app.terraform.io/Simplified/cert-issuer/helm"
  version    = "v0.0.4"

  name      = "prod"
  namespace = kubernetes_namespace.ingress.metadata[0].name

  # E-mail address redacted by the reporter.
  acme_email    = "redacted"
  helm_password = var.helm_password

  # Uses the hub project ID, not var.project_id that the cluster itself uses.
  project_id = var.hub_project_id
}

sqlproxy.tf

# Namespace named "sqlproxy" for the SQL proxy module below.
# Ordered after module.gke_default_pool, which is declared outside the files
# quoted in this report.
resource "kubernetes_namespace" "sql_proxy" {
  depends_on = [module.gke_default_pool]
  metadata {
    name = "sqlproxy"
  }
}

# Private "sqlproxy/k8s" registry module (v0.0.1), deployed into the "sqlproxy"
# namespace. The project, service account, instance name and region are
# hard-coded literals rather than variables.
module "sql_proxy" {
  depends_on = [kubernetes_namespace.sql_proxy]
  source     = "app.terraform.io/Simplified/sqlproxy/k8s"
  version    = "v0.0.1"
  namespace  = kubernetes_namespace.sql_proxy.metadata[0].name

  gcp_project_id           = "sasp-ca"
  gcp_service_account_name = "ps-production-sqlproxy"
  instance_name            = "sasp-core-ca"
  instance_region          = "northamerica-northeast1"
}

providers.tf

# Google provider. Credentials arrive base64-encoded in var.gcp_json.
# The region is derived from var.cluster_location by stripping a trailing
# single-letter zone suffix ("-a", "-b", ...). Unlike trimsuffix(..., "-a"), this
# works for every zone of a region, and a value that is already a region (ends
# in a digit) passes through unchanged.
provider "google" {
  credentials = base64decode(var.gcp_json)
  project     = var.project_id
  region      = replace(var.cluster_location, "/-[a-z]$/", "")
  zone        = var.cluster_location
}

# Beta provider, configured identically to the default google provider above so
# both always target the same project, region and zone.
provider "google-beta" {
  credentials = base64decode(var.gcp_json)
  project     = var.project_id
  region      = replace(var.cluster_location, "/-[a-z]$/", "")
  zone        = var.cluster_location
}

# Kubernetes provider wired directly to the GKE cluster: kubeconfig loading is
# disabled, and host, token and CA certificate all come from module.gke_master
# outputs. The CA certificate output is base64-decoded before use.
# NOTE(review): the maintainers' reply in this thread describes configuring the
# provider from a cluster managed in the same apply as an unstable setup. The
# "dial tcp 127.0.0.1:80" errors in the plan output show the provider talking to
# localhost instead of the cluster endpoint.
provider "kubernetes" {
  load_config_file = false
  host             = module.gke_master.endpoint
  token            = module.gke_master.token
  cluster_ca_certificate = base64decode(
    module.gke_master.cluster_ca_certificate
  )
}

# Helm provider. Its nested kubernetes block takes the same host, token and
# base64-decoded CA certificate from module.gke_master as the kubernetes
# provider does, so it has the same dependency on the cluster module's outputs.
provider "helm" {
  kubernetes {
    # Left commented out by the reporter; the reason is not stated.
    # load_config_file = false
    host  = module.gke_master.endpoint
    token = module.gke_master.token
    cluster_ca_certificate = base64decode(
      module.gke_master.cluster_ca_certificate
    )
  }
}

versions.tf

# Provider and CLI version constraints. Every provider is pinned to one exact
# version; the Terraform CLI itself only has a lower bound.
terraform {
  required_providers {
    google = {
      source  = "hashicorp/google"
      version = "3.51.0"
    }
    google-beta = {
      source  = "hashicorp/google-beta"
      version = "3.51.0"
    }
    helm = {
      source  = "hashicorp/helm"
      version = "2.0.1"
    }
    kubernetes = {
      source  = "hashicorp/kubernetes"
      version = "1.13.2"
    }
  }
  # Lower bound only: this admits both 0.13.5 (reported as working) and
  # 0.14.x (reported as failing).
  required_version = ">= 0.13"
}

module main.tf

# Client configuration of the active Google provider. Its access_token is
# exported as this module's "token" output.
data "google_client_config" "gcp_cluster" {
}

# The GKE cluster. Name, location, project, extra node zones and release
# channel all come from module variables.
resource "google_container_cluster" "gcp_cluster" {
  name           = var.cluster_name
  location       = var.cluster_location
  node_locations = var.additional_cluster_zones
  project        = var.cluster_project_id
  release_channel {
    channel = var.release_channel
  }

  # We can't create a cluster with no node pool defined, but we want to only use
  # separately managed node pools. So we create the smallest possible default
  # node pool and immediately delete it.
  remove_default_node_pool = true
  initial_node_count       = 1

  network    = var.network_name
  subnetwork = var.subnet_name

  # Emitted once when enable_custom_networking is true, otherwise omitted: the
  # for_each list holds one element or none.
  dynamic "ip_allocation_policy" {
    for_each = var.enable_custom_networking == true ? [var.enable_custom_networking] : []
    content {
      cluster_secondary_range_name  = var.pod_subnet_name
      services_secondary_range_name = var.services_subnet_name
    }
  }

  # Same one-or-none pattern, gated on enable_private_cluster. Nodes become
  # private while the master's private-endpoint-only mode stays off.
  dynamic "private_cluster_config" {
    for_each = var.enable_private_cluster == true ? [var.enable_private_cluster] : []
    content {
      enable_private_endpoint = false
      enable_private_nodes    = true
      master_ipv4_cidr_block  = var.master_cidr_range
    }
  }

  # Basic auth plus an issued client certificate. The resulting credentials are
  # re-exported by the module's outputs.
  master_auth {
    username = var.master_username
    password = var.master_password

    client_certificate_config {
      issue_client_certificate = true
    }
  }

  timeouts {
    create = var.cluster_create_timeout
    delete = var.cluster_delete_timeout
  }

  maintenance_policy {
    daily_maintenance_window {
      start_time = var.maintenance_start_time
    }
  }
}

module outputs.tf

# Basic-auth username configured in the cluster's master_auth block.
output "cluster_username" {
  value = google_container_cluster.gcp_cluster.master_auth[0].username
}

# Basic-auth password. Marked sensitive so Terraform redacts it from CLI output.
# NOTE(review): a root module that re-exports this value must mark its own
# output sensitive as well.
output "cluster_password" {
  value     = google_container_cluster.gcp_cluster.master_auth[0].password
  sensitive = true
}

# Cluster master endpoint; the root configuration uses it as the provider host.
output "endpoint" {
  value = google_container_cluster.gcp_cluster.endpoint
}

output "instance_group_urls" {
  value = google_container_cluster.gcp_cluster.instance_group_urls
}

output "node_config" {
  value = google_container_cluster.gcp_cluster.node_config
}

output "node_pools" {
  value = google_container_cluster.gcp_cluster.node_pool
}

# Client certificate issued because issue_client_certificate is true.
output "client_certificate" {
  value = google_container_cluster.gcp_cluster.master_auth[0].client_certificate
}

# Private key belonging to client_certificate. Marked sensitive so it is
# redacted from CLI output.
output "client_key" {
  value     = google_container_cluster.gcp_cluster.master_auth[0].client_key
  sensitive = true
}

# Cluster CA certificate. The root configuration passes it through
# base64decode() before handing it to the providers.
output "cluster_ca_certificate" {
  value = google_container_cluster.gcp_cluster.master_auth[0].cluster_ca_certificate
}

# Access token of the active Google provider, used by the root configuration as
# the Kubernetes bearer token. Marked sensitive so it is redacted from CLI
# output.
output "token" {
  value     = data.google_client_config.gcp_cluster.access_token
  sensitive = true
}

module variables.tf

# Cluster identity, release channel and master credentials.
# Every variable in this group except additional_cluster_zones has no default
# and must be supplied by the caller. Only additional_cluster_zones declares a
# type constraint.
variable "cluster_name" {
  description = "The name of the cluster, unique within the project and location"
}

variable "cluster_location" {
  description = "The location (region or zone) in which the cluster master will be created, as well as the default node location"
}

variable "additional_cluster_zones" {
  description = "The list of zones in which the cluster's nodes are located. Nodes must be in the region of their regional cluster or in the same region as their cluster's zone for zonal clusters"
  type        = list(string)
  default     = []
}

variable "cluster_project_id" {
  description = "The ID of the project in which the resource belongs"
}

# Find release channels here
# https://cloud.google.com/kubernetes-engine/docs/concepts/release-channels
variable "release_channel" {
  description = "GKE master update release channel"
}

# master_username and master_password feed the cluster's master_auth block and
# are re-exported as the cluster_username and cluster_password outputs.
variable "master_username" {
  description = "The username to use for HTTP basic authentication when accessing the Kubernetes master endpoint"
}

variable "master_password" {
  description = "The password to use for HTTP basic authentication when accessing the Kubernetes master endpoint"
}

# Used as timeouts.create on the cluster resource.
variable "cluster_create_timeout" {
  description = "Maximum time to wait for the cluster to be created"
  default     = "30m"
}

# Used as timeouts.delete on the cluster resource.
variable "cluster_delete_timeout" {
  description = "Maximum time to wait for the cluster to be deleted"
  default     = "1h"
}

# Time is in GMT timezone
# Used as daily_maintenance_window.start_time on the cluster resource.
variable "maintenance_start_time" {
  description = "Start time of the daily maintenance window in HH:MM format, NOTE time is in GMT timezone"
  default     = "06:00"
}

# Networking inputs. All have defaults, so a caller that sets none of them gets
# the "default" network and subnet with neither optional block enabled.
variable "network_name" {
  description = "name of the network to use"
  default     = "default"
}

variable "subnet_name" {
  description = "name of the subnet to use"
  default     = "default"
}

# Gates the ip_allocation_policy block on the cluster resource.
variable "enable_custom_networking" {
  description = "Enable custom networking"
  type        = bool
  default     = false
}

# Gates the private_cluster_config block on the cluster resource.
variable "enable_private_cluster" {
  description = "Enable private cluster"
  type        = bool
  default     = false
}

# Only read when enable_custom_networking is true.
variable "pod_subnet_name" {
  description = "name of the pod subnet to use"
  default     = "pod-subnet"
}

# Only read when enable_custom_networking is true.
variable "services_subnet_name" {
  description = "name of the services subnet to use"
  default     = "services-subnet"
}

# Only read when enable_private_cluster is true.
variable "master_cidr_range" {
  description = "Master CIDR network"
  default     = "172.16.0.32/28"
}

Debug Output


https://gist.github.com/shanehughes1990/12f787bbcd7f22d2ca034e68195ce47e

Crash Output

Expected Behavior


After Terraform has applied the config, everything comes up as expected. When I then change enable_private_cluster to true, enable_custom_networking to true, or anything else in the gke_master module, terraform plan errors out, when it should instead produce a valid plan with the changes.

Actual Behavior


terraform errors on terraform plan

shanehughes@Ubuntu20Desktop:~/repos/clusters/terraform-ps-cluster$ terraform plan
module.gke_master.google_container_cluster.gcp_cluster: Refreshing state... [id=projects/parksmart-production/locations/northamerica-northeast1-a/clusters/cluster-production]
module.gke_default_pool.google_container_node_pool.gcp_cluster_nodes: Refreshing state... [id=projects/parksmart-production/locations/northamerica-northeast1-a/clusters/cluster-production/nodePools/default-pool]
kubernetes_namespace.ingress: Refreshing state... [id=ingress]
kubernetes_namespace.sql_proxy: Refreshing state... [id=sqlproxy]

Error: Get "http://localhost/api/v1/namespaces/sqlproxy": dial tcp 127.0.0.1:80: connect: connection refused



Error: Get "http://localhost/api/v1/namespaces/ingress": dial tcp 127.0.0.1:80: connect: connection refused

Steps to Reproduce


Hard for me to explain how to reproduce, as these are all private modules, but I guess try bringing up a cluster, some namespaces, and change something in the google_container_cluster resource and try terraform plan again.

Additional Context


This DOES NOT happen on Terraform 0.13.5; there, Terraform works as intended.

Currently testing on Ubuntu 20
This will eventually run in Terraform Cloud (the state is already saved there); I was working out all the bugs locally before letting Terraform Cloud take over.

Running on GKE, RELEASE branch

Server Version: version.Info{Major:"1", Minor:"17+", GitVersion:"v1.17.13-gke.2600", GitCommit:"fc4bf3b03703b51c48ba123e8abc53b3051ba8a7", GitTreeState:"clean", BuildDate:"2020-11-11T09:20:10Z", GoVersion:"go1.13.15b4", Compiler:"gc", Platform:"linux/amd64"}

References

bug

Most helpful comment

Please re-open. This happens for us too. We were happily working away in TF13 with multiple applies, destroys, updates, etc. all working perfectly fine with Infrastructure and K8S and Helm in sub-modules with dependencies between one another in a composite module. We didn't have any issues. However, as soon as we moved to TF14 - boom! It all stopped working, and we get the exact same message as mentioned above.

I am afraid that targeted applies will greatly increase the deployment time for us - it does feel like a "go-to" response for these types of challenges in various bug reports, it would be great to simply get them resolved. Evidently, something has broken between TF13 and TF14.

It should also be noted that if we do a destroy, then everything works as expected and TF can successfully connect to K8S to destroy the required resources. This only occurs during a plan against already applied infrastructure.

We have since moved all in-cluster resources to a different state, and everything works as intended. My suggestion to you as well is to make sure you have no Kubernetes resources in the same state as the resources that build the cluster.

All 8 comments

@shanehughes1990 this appears to be happening because the cluster is being configured in the same apply operation as your other resources (described in the docs here as an unstable setup - https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs#stacking-with-managed-kubernetes-cluster-resources)

So, okay: is the cause that it has to delete the namespace first before recreating the master? And why does it only happen on Terraform 0.14? It works as I expect it to on 0.13.5. Is there a way around this? Using Terraform Cloud this way does not work; it fails. Would destroying the resources before changing the GKE setup be the solution?

@shanehughes1990 the difficulty you're describing with varying behaviour between versions is why we discourage having cluster configuration in the same apply as other resources. On your local machine, you could use -target to do more directed applies, but in terraform cloud you would need to configure run triggers in order to have this separation. We are aware of this general issue and tracking use cases and progress in https://github.com/hashicorp/terraform/issues/4149.

You can learn more on how to use run triggers by following the learn guide

I'll close this issue for now - please reopen if you continue to face difficulty.

Please re-open. This happens for us too. We were happily working away in TF13 with multiple applies, destroys, updates, etc. all working perfectly fine with Infrastructure and K8S and Helm in sub-modules with dependencies between one another in a composite module. We didn't have any issues. However, as soon as we moved to TF14 - boom! It all stopped working, and we get the exact same message as mentioned above.

I am afraid that targeted applies will greatly increase the deployment time for us - it does feel like a "go-to" response for these types of challenges in various bug reports, it would be great to simply get them resolved. Evidently, something has broken between TF13 and TF14.

It should also be noted that if we do a destroy, then everything works as expected and TF can successfully connect to K8S to destroy the required resources. This only occurs during a plan against already applied infrastructure.

Please re-open. This happens for us too. We were happily working away in TF13 with multiple applies, destroys, updates, etc. all working perfectly fine with Infrastructure and K8S and Helm in sub-modules with dependencies between one another in a composite module. We didn't have any issues. However, as soon as we moved to TF14 - boom! It all stopped working, and we get the exact same message as mentioned above.

I am afraid that targeted applies will greatly increase the deployment time for us - it does feel like a "go-to" response for these types of challenges in various bug reports, it would be great to simply get them resolved. Evidently, something has broken between TF13 and TF14.

It should also be noted that if we do a destroy, then everything works as expected and TF can successfully connect to K8S to destroy the required resources. This only occurs during a plan against already applied infrastructure.

We have since moved all in-cluster resources to a different state, and everything works as intended. My suggestion to you as well is to make sure you have no Kubernetes resources in the same state as the resources that build the cluster.

I'm going to lock this issue because it has been closed for _30 days_ ⏳. This helps our maintainers find and focus on the active issues.

If you feel this issue should be reopened, we encourage creating a new issue linking back to this one for added context. If you feel I made an error 🤖 🙉 , please reach out to my human friends 👉 hashibot-feedback@hashicorp.com. Thanks!

Was this page helpful?
0 / 5 - 0 ratings