diff --git a/src/main/kotlin/org/domaindrivenarchitecture/provs/framework/extensions/server_software/standalone_server/prometheus/base/Prometheus.kt b/src/main/kotlin/org/domaindrivenarchitecture/provs/framework/extensions/server_software/standalone_server/prometheus/base/Prometheus.kt
index f55b5f6..4df54d0 100644
--- a/src/main/kotlin/org/domaindrivenarchitecture/provs/framework/extensions/server_software/standalone_server/prometheus/base/Prometheus.kt
+++ b/src/main/kotlin/org/domaindrivenarchitecture/provs/framework/extensions/server_software/standalone_server/prometheus/base/Prometheus.kt
@@ -11,7 +11,7 @@
 internal val configDir = "/etc/prometheus/"
 internal val configFile = "prometheus.yml"
 
-fun Prov.configurePrometheusDocker(config: String = prometheusDefaultConfig) = task {
+fun Prov.configurePrometheusDocker(config: String = prometheusDefaultConfig()) = task {
     createDirs(configDir, sudo = true)
     createFile(configDir + configFile, config, sudo = true)
 }
@@ -51,25 +51,22 @@ fun Prov.runPrometheusDocker(nginxHost: String? = null) = task {
 }
 
 
-private const val prometheusDefaultConfig =
+private fun prometheusDefaultConfig() =
     """
     global:
-      scrape_interval: 15s # By default, scrape targets every 15 seconds.
-
-      # Attach these labels to any time series or alerts when communicating with
-      # external systems (federation, remote storage, Alertmanager).
-      external_labels:
-        monitor: 'codelab-monitor'
+      scrape_interval: 15s # By default, scrape targets every 15 seconds.
 
     # A scrape configuration containing exactly one endpoint to scrape:
     # Here it's Prometheus itself.
     scrape_configs:
      # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
      - job_name: 'prometheus'
-
-        # Override the global default and scrape targets from this job every 5 seconds.
-        scrape_interval: 5s
        static_configs:
          - targets: ['localhost:9090']
+
+remote_write:
+  - url: ""
+    basic_auth:
+      username: "your grafana username"
+      password: "your Grafana API key"
     """
diff --git a/src/main/kotlin/org/domaindrivenarchitecture/provs/framework/ubuntu/secret/SecretSource.kt b/src/main/kotlin/org/domaindrivenarchitecture/provs/framework/ubuntu/secret/SecretSource.kt
index dc491e7..663bfc6 100644
--- a/src/main/kotlin/org/domaindrivenarchitecture/provs/framework/ubuntu/secret/SecretSource.kt
+++ b/src/main/kotlin/org/domaindrivenarchitecture/provs/framework/ubuntu/secret/SecretSource.kt
@@ -30,10 +30,8 @@ enum class SecretSourceType() {
 
 
 @Serializable
-@Suppress("unused") // for use in other projects
 class SecretSupplier(private val source: SecretSourceType, val parameter: String) {
     fun secret(): Secret {
         return source.secret(parameter)
     }
 }
-
diff --git a/src/main/kotlin/org/domaindrivenarchitecture/provs/server/domain/k3s/K3sService.kt b/src/main/kotlin/org/domaindrivenarchitecture/provs/server/domain/k3s/K3sService.kt
index d62913d..d49a209 100644
--- a/src/main/kotlin/org/domaindrivenarchitecture/provs/server/domain/k3s/K3sService.kt
+++ b/src/main/kotlin/org/domaindrivenarchitecture/provs/server/domain/k3s/K3sService.kt
@@ -1,15 +1,16 @@
 package org.domaindrivenarchitecture.provs.server.domain.k3s
 
 import org.domaindrivenarchitecture.provs.framework.core.Prov
-import org.domaindrivenarchitecture.provs.framework.core.ProvResult
+import org.domaindrivenarchitecture.provs.server.domain.k8s_grafana_agent.GrafanaAgentConfigResolved
+import org.domaindrivenarchitecture.provs.server.domain.k8s_grafana_agent.provisionGrafanaAgent
 import org.domaindrivenarchitecture.provs.server.infrastructure.*
-import org.domaindrivenarchitecture.provs.server.infrastructure.getK3sConfig
 
 /**
  * Installs a k3s server.
 */
 fun Prov.provisionK3s(cli: K3sCliCommand) = task {
     val k3sConfig: K3sConfig = getK3sConfig(cli.configFileName)
+    val grafanaConfigResolved: GrafanaAgentConfigResolved? = findK8sGrafanaConfig(cli.configFileName)?.resolveSecret()
     provisionNetwork(k3sConfig)
 
     if (k3sConfig.reprovision && testConfigExists()) {
@@ -21,11 +22,16 @@
     if (k3sConfig.certmanager != null) {
         provisionK3sCertManager(k3sConfig.certmanager)
     }
+
     if (k3sConfig.echo == true) {
         provisionK3sEcho(k3sConfig.fqdn, k3sConfig.certmanager?.letsencryptEndpoint)
     }
+
+    if (grafanaConfigResolved != null) {
+        provisionGrafanaAgent(grafanaConfigResolved)
+    }
+
     if (cli.applicationFileName != null) {
         provisionK3sApplication(cli.applicationFileName)
     }
-    ProvResult(true)
 }
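Note (annotation, not part of the patch): `prometheusDefaultConfig` is now a function whose output ends in a `remote_write` block with placeholder Grafana Cloud credentials. Callers that only scrape locally can keep passing their own config string. A minimal sketch; the wrapper name is hypothetical and the import path mirrors the file path above:

```kotlin
import org.domaindrivenarchitecture.provs.framework.core.Prov
import org.domaindrivenarchitecture.provs.framework.extensions.server_software.standalone_server.prometheus.base.configurePrometheusDocker

// Sketch: bypass the remote_write placeholders by supplying a complete config.
fun Prov.configureLocalOnlyPrometheus() = configurePrometheusDocker(
    """
    global:
      scrape_interval: 15s
    scrape_configs:
      - job_name: 'prometheus'
        static_configs:
          - targets: ['localhost:9090']
    """
)
```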
diff --git a/src/main/kotlin/org/domaindrivenarchitecture/provs/server/domain/k8s_grafana_agent/GrafanaAgent.kt b/src/main/kotlin/org/domaindrivenarchitecture/provs/server/domain/k8s_grafana_agent/GrafanaAgent.kt
new file mode 100644
index 0000000..f42f5a2
--- /dev/null
+++ b/src/main/kotlin/org/domaindrivenarchitecture/provs/server/domain/k8s_grafana_agent/GrafanaAgent.kt
@@ -0,0 +1,9 @@
+package org.domaindrivenarchitecture.provs.server.domain.k8s_grafana_agent
+
+import org.domaindrivenarchitecture.provs.framework.core.Prov
+import org.domaindrivenarchitecture.provs.server.infrastructure.provisionGrafanaAgentForK8s
+
+
+fun Prov.provisionGrafanaAgent(configResolved: GrafanaAgentConfigResolved) =
+    provisionGrafanaAgentForK8s(configResolved.user, configResolved.password, configResolved.cluster)
+
diff --git a/src/main/kotlin/org/domaindrivenarchitecture/provs/server/domain/k8s_grafana_agent/GrafanaAgentConfig.kt b/src/main/kotlin/org/domaindrivenarchitecture/provs/server/domain/k8s_grafana_agent/GrafanaAgentConfig.kt
new file mode 100644
index 0000000..2cd7db2
--- /dev/null
+++ b/src/main/kotlin/org/domaindrivenarchitecture/provs/server/domain/k8s_grafana_agent/GrafanaAgentConfig.kt
@@ -0,0 +1,25 @@
+package org.domaindrivenarchitecture.provs.server.domain.k8s_grafana_agent
+
+import kotlinx.serialization.Serializable
+import org.domaindrivenarchitecture.provs.framework.core.Secret
+import org.domaindrivenarchitecture.provs.framework.ubuntu.secret.SecretSupplier
+
+@Serializable
+data class GrafanaAgentConfig(
+    val user: String,
+    val password: SecretSupplier,
+    val cluster: String
+) {
+    fun resolveSecret(): GrafanaAgentConfigResolved = GrafanaAgentConfigResolved(this)
+}
+
+data class GrafanaAgentConfigResolved(val configUnresolved: GrafanaAgentConfig) {
+    val user: String = configUnresolved.user
+    val password: Secret = configUnresolved.password.secret()
+    val cluster: String = configUnresolved.cluster
+}
+
+@Serializable
+data class GrafanaAgentConfigHolder(
+    val grafana: GrafanaAgentConfig
+)
diff --git a/src/main/kotlin/org/domaindrivenarchitecture/provs/server/infrastructure/GrafanaAgent.kt b/src/main/kotlin/org/domaindrivenarchitecture/provs/server/infrastructure/GrafanaAgent.kt
new file mode 100644
index 0000000..01f1827
--- /dev/null
+++ b/src/main/kotlin/org/domaindrivenarchitecture/provs/server/infrastructure/GrafanaAgent.kt
@@ -0,0 +1,64 @@
+package org.domaindrivenarchitecture.provs.server.infrastructure
+
+import org.domaindrivenarchitecture.provs.framework.core.Prov
+import org.domaindrivenarchitecture.provs.framework.core.Secret
+import org.domaindrivenarchitecture.provs.framework.ubuntu.filesystem.base.createFileFromResource
+import org.domaindrivenarchitecture.provs.framework.ubuntu.filesystem.base.createFileFromResourceTemplate
+import org.domaindrivenarchitecture.provs.server.domain.k3s.FileMode
+import java.io.File
+
+
+private const val grafanaResourceDir = "org/domaindrivenarchitecture/provs/server/infrastructure/grafana/"
+
+
+fun Prov.provisionGrafanaAgentForK8s(user: String, password: Secret, clusterName: String) = task {
+    val namespace = "monitoring"
+
+    // Create namespace if not yet existing
+    if (!chk("kubectl get namespace $namespace")) {
+        cmd("kubectl create namespace $namespace")
+    }
+
+    // Deploy grafana-agent
+    applyGrafanaFileFromResource(File(k3sManualManifestsDir, "grafana-agent.yaml"))
+
+    // Deploy node-exporter
+    applyGrafanaFileFromResource(File(k3sManualManifestsDir, "node-exporter-daemon-set.yaml"))
+
+    // Deploy grafana config
+    createFileFromResourceTemplate(
+        k3sManualManifestsDir + "grafana-agent-config-map.yaml",
+        "grafana-agent-config-map.template.yaml",
+        resourcePath = grafanaResourceDir,
+        posixFilePermission = "644",
+        values = mapOf(
+            "USERNAME" to user,
+            "APIKEY" to password.plain(),
+            "CLUSTERNAME" to clusterName,
+        )
+    )
+    cmd("export NAMESPACE=$namespace && kubectl apply -n \$NAMESPACE -f grafana-agent-config-map.yaml", k3sManualManifestsDir)
+
+    // restart grafana-agent
+    cmd("kubectl -n $namespace rollout restart deployment/grafana-agent")
+}
+
+// ============================ private functions =============================
+
+private fun Prov.createGrafanaFileFromResource(
+    file: File,
+    posixFilePermission: FileMode? = "644"
+) = task {
+    createFileFromResource(
+        file.path,
+        file.name,
+        grafanaResourceDir,
+        posixFilePermission,
+        sudo = true
+    )
+}
+
+private fun Prov.applyGrafanaFileFromResource(file: File, posixFilePermission: String? = "644") = task {
+    createGrafanaFileFromResource(file, posixFilePermission)
+    cmd("kubectl apply -f ${file.path}", sudo = true)
+}
diff --git a/src/main/kotlin/org/domaindrivenarchitecture/provs/server/infrastructure/GrafanaAgentRepository.kt b/src/main/kotlin/org/domaindrivenarchitecture/provs/server/infrastructure/GrafanaAgentRepository.kt
new file mode 100644
index 0000000..abcb106
--- /dev/null
+++ b/src/main/kotlin/org/domaindrivenarchitecture/provs/server/infrastructure/GrafanaAgentRepository.kt
@@ -0,0 +1,30 @@
+package org.domaindrivenarchitecture.provs.server.infrastructure
+
+import org.domaindrivenarchitecture.provs.configuration.domain.ConfigFileName
+import org.domaindrivenarchitecture.provs.framework.core.readFromFile
+import org.domaindrivenarchitecture.provs.framework.core.toYaml
+import org.domaindrivenarchitecture.provs.framework.core.yamlToType
+import org.domaindrivenarchitecture.provs.server.domain.k8s_grafana_agent.GrafanaAgentConfigHolder
+import org.domaindrivenarchitecture.provs.server.domain.k8s_grafana_agent.GrafanaAgentConfig
+import java.io.File
+import java.io.FileWriter
+
+
+private const val DEFAULT_CONFIG_FILE = "server-config.yaml"
+
+
+fun findK8sGrafanaConfig(fileName: ConfigFileName? = null): GrafanaAgentConfig? {
+    val filePath = fileName?.fileName ?: DEFAULT_CONFIG_FILE
+
+    // read the grafana section from the config file if it exists, otherwise return null
+    return if (File(filePath).exists()) {
+        readFromFile(filePath).yamlToType<GrafanaAgentConfigHolder>().grafana
+    } else {
+        null
+    }
+}
+
+
+@Suppress("unused")
+internal fun writeConfig(config: GrafanaAgentConfigHolder, fileName: String = "grafana-config.yaml") =
+    FileWriter(fileName).use { it.write(config.toYaml()) }
diff --git a/src/main/kotlin/org/domaindrivenarchitecture/provs/server/infrastructure/K3s.kt b/src/main/kotlin/org/domaindrivenarchitecture/provs/server/infrastructure/K3s.kt
index 4622fd7..faf674f 100644
--- a/src/main/kotlin/org/domaindrivenarchitecture/provs/server/infrastructure/K3s.kt
+++ b/src/main/kotlin/org/domaindrivenarchitecture/provs/server/infrastructure/K3s.kt
@@ -13,11 +13,12 @@
 import java.io.File
 
 // ----------------------------------- directories --------------------------------
 
-private const val k3sResourceDir = "org/domaindrivenarchitecture/provs/server/infrastructure/k3s/"
+const val k3sManualManifestsDir = "/etc/rancher/k3s/manifests/"
 
-private const val k8sCredentialsDir = "/etc/kubernetes/"
 private const val k3sAutomatedManifestsDir = "/var/lib/rancher/k3s/server/manifests/"
-private const val k3sManualManifestsDir = "/etc/rancher/k3s/manifests/"
+private const val k8sCredentialsDir = "/etc/kubernetes/"
+
+private const val k3sResourceDir = "org/domaindrivenarchitecture/provs/server/infrastructure/k3s/"
 
 // ----------------------------------- files --------------------------------
@@ -146,7 +147,11 @@ fun Prov.provisionK3sApplication(applicationFileName: ApplicationFileName) = task {
 }
 
-// ============================ private functions =============================
+// ============================ private and internal functions =============================
+
+internal fun Prov.applyK3sFile(file: File) = task {
+    cmd("kubectl apply -f ${file.path}", sudo = true)
+}
 
 private fun Prov.createK3sFileFromResource(
     file: File,
@@ -192,10 +197,6 @@ private fun Prov.createK3sFileFromResourceTemplate(
     )
 }
 
-private fun Prov.applyK3sFile(file: File) = task {
-    cmd("kubectl apply -f ${file.path}", sudo = true)
-}
-
 private fun File.templateName(): String {
     return this.name.replace(".yaml", ".template.yaml")
 }
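Note (annotation, not part of the patch): the repository above reads the `grafana:` section of `server-config.yaml` into `GrafanaAgentConfigHolder`. A round-trip sketch follows; `SecretSourceType.PLAIN` and the serialized field names of `SecretSupplier` (`source`/`parameter`) are assumptions for illustration, and `writeConfig` is `internal`, so the sketch is assumed to live in the same module:

```kotlin
import org.domaindrivenarchitecture.provs.framework.ubuntu.secret.SecretSourceType
import org.domaindrivenarchitecture.provs.framework.ubuntu.secret.SecretSupplier
import org.domaindrivenarchitecture.provs.server.domain.k8s_grafana_agent.GrafanaAgentConfig
import org.domaindrivenarchitecture.provs.server.domain.k8s_grafana_agent.GrafanaAgentConfigHolder
import org.domaindrivenarchitecture.provs.server.infrastructure.findK8sGrafanaConfig
import org.domaindrivenarchitecture.provs.server.infrastructure.writeConfig

// Assumed YAML shape of the grafana section (placeholder values):
//   grafana:
//     user: "123456"
//     password:
//       source: "PLAIN"
//       parameter: "my-api-key"
//     cluster: "demo-cluster"
fun grafanaConfigRoundTrip() {
    val holder = GrafanaAgentConfigHolder(
        GrafanaAgentConfig(
            user = "123456",                                            // Grafana Cloud metrics user id
            password = SecretSupplier(SecretSourceType.PLAIN, "my-api-key"),
            cluster = "demo-cluster"                                    // becomes the external 'cluster' label
        )
    )
    writeConfig(holder, "server-config.yaml")   // serialize the holder to YAML
    val grafana = findK8sGrafanaConfig()        // reads server-config.yaml by default, null if absent
    checkNotNull(grafana) { "grafana section should be readable after writeConfig" }
    val resolved = grafana.resolveSecret()      // materializes the Secret exactly once
    println(resolved.cluster)
}
```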
diff --git a/src/main/resources/org/domaindrivenarchitecture/provs/server/infrastructure/grafana/grafana-agent-config-map.template.yaml b/src/main/resources/org/domaindrivenarchitecture/provs/server/infrastructure/grafana/grafana-agent-config-map.template.yaml
new file mode 100644
index 0000000..256003c
--- /dev/null
+++ b/src/main/resources/org/domaindrivenarchitecture/provs/server/infrastructure/grafana/grafana-agent-config-map.template.yaml
@@ -0,0 +1,112 @@
+kind: ConfigMap
+metadata:
+  name: grafana-agent
+  namespace: monitoring
+apiVersion: v1
+data:
+  agent.yaml: |
+    server:
+      http_listen_port: 12345
+    metrics:
+      wal_directory: /tmp/grafana-agent-wal
+      global:
+        scrape_interval: 60s
+        external_labels:
+          cluster: $CLUSTERNAME
+      configs:
+      - name: integrations
+        remote_write:
+        - url: https://prometheus-prod-01-eu-west-0.grafana.net/api/prom/push
+          basic_auth:
+            username: $USERNAME
+            password: $APIKEY
+        scrape_configs:
+        - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
+          job_name: integrations/kubernetes/cadvisor
+          kubernetes_sd_configs:
+            - role: node
+          metric_relabel_configs:
+          - source_labels: [__name__]
+            regex: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate|container_memory_swap|kube_statefulset_status_replicas_ready|kube_node_spec_taint|storage_operation_duration_seconds_count|kubelet_pleg_relist_duration_seconds_count|cluster:namespace:pod_memory:active:kube_pod_container_resource_requests|kube_daemonset_status_number_available|kubelet_certificate_manager_server_ttl_seconds|container_network_receive_packets_total|namespace_workload_pod|kubelet_pod_worker_duration_seconds_count|kube_statefulset_status_replicas_updated|kube_pod_status_phase|volume_manager_total_volumes|kubelet_running_containers|storage_operation_errors_total|kube_statefulset_status_observed_generation|kubelet_node_config_error|container_fs_reads_bytes_total|kube_horizontalpodautoscaler_status_current_replicas|rest_client_request_duration_seconds_bucket|container_memory_cache|kube_daemonset_updated_number_scheduled|kube_job_spec_completions|kubelet_volume_stats_capacity_bytes|kube_daemonset_status_number_misscheduled|container_memory_rss|namespace_cpu:kube_pod_container_resource_limits:sum|kubelet_volume_stats_inodes_used|container_network_transmit_bytes_total|kubelet_runtime_operations_errors_total|container_fs_writes_total|kube_daemonset_status_current_number_scheduled|kube_node_status_capacity|container_network_receive_packets_dropped_total|storage_operation_duration_seconds_bucket|cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests|container_network_receive_bytes_total|kubelet_runtime_operations_total|kubelet_pod_start_duration_seconds_count|kube_horizontalpodautoscaler_status_desired_replicas|container_cpu_usage_seconds_total|up|kube_resourcequota|kubelet_cgroup_manager_duration_seconds_bucket|kube_horizontalpodautoscaler_spec_max_replicas|kubelet_server_expiration_renew_errors|container_fs_writes_bytes_total|kubelet_pod_worker_duration_seconds_bucket|machine_memory_bytes|cluster:namespace:pod_memory:active:kube_pod_container_resource_limits|kube_statefulset_status_current_revision|kube_deployment_status_observed_generation|kubelet_volume_stats_inodes|kubelet_volume_stats_available_bytes|kube_deployment_status_replicas_available|container_network_transmit_packets_total|kubelet_pleg_relist_duration_seconds_bucket|cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits|container_cpu_cfs_periods_total|process_cpu_seconds_total|kubelet_pleg_relist_interval_seconds_bucket|kube_horizontalpodautoscaler_spec_min_replicas|kube_statefulset_status_replicas|kube_deployment_spec_replicas|namespace_workload_pod:kube_pod_owner:relabel|kube_pod_owner|node_namespace_pod_container:container_memory_working_set_bytes|kubelet_node_name|kube_job_failed|container_memory_working_set_bytes|kubelet_runtime_operations_duration_seconds_bucket|namespace_memory:kube_pod_container_resource_requests:sum|kube_node_info|kubernetes_build_info|kube_statefulset_status_update_revision|kube_pod_container_resource_limits|kubelet_running_pods|kube_statefulset_replicas|kube_namespace_created|node_namespace_pod_container:container_memory_rss|node_namespace_pod_container:container_memory_cache|kube_node_status_condition|kube_pod_container_resource_requests|kubelet_running_pod_count|namespace_memory:kube_pod_container_resource_limits:sum|node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile|kube_pod_container_status_waiting_reason|kubelet_certificate_manager_client_expiration_renew_errors|kubelet_running_container_count|node_namespace_pod_container:container_memory_swap|kube_deployment_status_replicas_updated|process_resident_memory_bytes|rest_client_requests_total|kube_node_status_allocatable|kube_statefulset_metadata_generation|kube_deployment_metadata_generation|container_network_transmit_packets_dropped_total|kube_pod_info|kubelet_cgroup_manager_duration_seconds_count|container_fs_reads_total|kube_daemonset_status_desired_number_scheduled|container_cpu_cfs_throttled_periods_total|kube_job_status_succeeded|kubelet_certificate_manager_client_ttl_seconds|kube_replicaset_owner|go_goroutines|namespace_cpu:kube_pod_container_resource_requests:sum
+            action: keep
+          relabel_configs:
+          - replacement: kubernetes.default.svc.cluster.local:443
+            target_label: __address__
+          - regex: (.+)
+            replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
+            source_labels:
+            - __meta_kubernetes_node_name
+            target_label: __metrics_path__
+          scheme: https
+          tls_config:
+            ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
+            insecure_skip_verify: false
+            server_name: kubernetes
+        - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
+          job_name: integrations/kubernetes/kubelet
+          kubernetes_sd_configs:
+            - role: node
+          metric_relabel_configs:
+          - source_labels: [__name__]
+            regex: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate|container_memory_swap|kube_statefulset_status_replicas_ready|kube_node_spec_taint|storage_operation_duration_seconds_count|kubelet_pleg_relist_duration_seconds_count|cluster:namespace:pod_memory:active:kube_pod_container_resource_requests|kube_daemonset_status_number_available|kubelet_certificate_manager_server_ttl_seconds|container_network_receive_packets_total|namespace_workload_pod|kubelet_pod_worker_duration_seconds_count|kube_statefulset_status_replicas_updated|kube_pod_status_phase|volume_manager_total_volumes|kubelet_running_containers|storage_operation_errors_total|kube_statefulset_status_observed_generation|kubelet_node_config_error|container_fs_reads_bytes_total|kube_horizontalpodautoscaler_status_current_replicas|rest_client_request_duration_seconds_bucket|container_memory_cache|kube_daemonset_updated_number_scheduled|kube_job_spec_completions|kubelet_volume_stats_capacity_bytes|kube_daemonset_status_number_misscheduled|container_memory_rss|namespace_cpu:kube_pod_container_resource_limits:sum|kubelet_volume_stats_inodes_used|container_network_transmit_bytes_total|kubelet_runtime_operations_errors_total|container_fs_writes_total|kube_daemonset_status_current_number_scheduled|kube_node_status_capacity|container_network_receive_packets_dropped_total|storage_operation_duration_seconds_bucket|cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests|container_network_receive_bytes_total|kubelet_runtime_operations_total|kubelet_pod_start_duration_seconds_count|kube_horizontalpodautoscaler_status_desired_replicas|container_cpu_usage_seconds_total|up|kube_resourcequota|kubelet_cgroup_manager_duration_seconds_bucket|kube_horizontalpodautoscaler_spec_max_replicas|kubelet_server_expiration_renew_errors|container_fs_writes_bytes_total|kubelet_pod_worker_duration_seconds_bucket|machine_memory_bytes|cluster:namespace:pod_memory:active:kube_pod_container_resource_limits|kube_statefulset_status_current_revision|kube_deployment_status_observed_generation|kubelet_volume_stats_inodes|kubelet_volume_stats_available_bytes|kube_deployment_status_replicas_available|container_network_transmit_packets_total|kubelet_pleg_relist_duration_seconds_bucket|cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits|container_cpu_cfs_periods_total|process_cpu_seconds_total|kubelet_pleg_relist_interval_seconds_bucket|kube_horizontalpodautoscaler_spec_min_replicas|kube_statefulset_status_replicas|kube_deployment_spec_replicas|namespace_workload_pod:kube_pod_owner:relabel|kube_pod_owner|node_namespace_pod_container:container_memory_working_set_bytes|kubelet_node_name|kube_job_failed|container_memory_working_set_bytes|kubelet_runtime_operations_duration_seconds_bucket|namespace_memory:kube_pod_container_resource_requests:sum|kube_node_info|kubernetes_build_info|kube_statefulset_status_update_revision|kube_pod_container_resource_limits|kubelet_running_pods|kube_statefulset_replicas|kube_namespace_created|node_namespace_pod_container:container_memory_rss|node_namespace_pod_container:container_memory_cache|kube_node_status_condition|kube_pod_container_resource_requests|kubelet_running_pod_count|namespace_memory:kube_pod_container_resource_limits:sum|node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile|kube_pod_container_status_waiting_reason|kubelet_certificate_manager_client_expiration_renew_errors|kubelet_running_container_count|node_namespace_pod_container:container_memory_swap|kube_deployment_status_replicas_updated|process_resident_memory_bytes|rest_client_requests_total|kube_node_status_allocatable|kube_statefulset_metadata_generation|kube_deployment_metadata_generation|container_network_transmit_packets_dropped_total|kube_pod_info|kubelet_cgroup_manager_duration_seconds_count|container_fs_reads_total|kube_daemonset_status_desired_number_scheduled|container_cpu_cfs_throttled_periods_total|kube_job_status_succeeded|kubelet_certificate_manager_client_ttl_seconds|kube_replicaset_owner|go_goroutines|namespace_cpu:kube_pod_container_resource_requests:sum
+            action: keep
+          relabel_configs:
+          - replacement: kubernetes.default.svc.cluster.local:443
+            target_label: __address__
+          - regex: (.+)
+            replacement: /api/v1/nodes/${1}/proxy/metrics
+            source_labels:
+            - __meta_kubernetes_node_name
+            target_label: __metrics_path__
+          scheme: https
+          tls_config:
+            ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
+            insecure_skip_verify: false
+            server_name: kubernetes
+        - job_name: integrations/kubernetes/kube-state-metrics
+          kubernetes_sd_configs:
+            - role: service
+          metric_relabel_configs:
+          - source_labels: [__name__]
+            regex: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate|container_memory_swap|kube_statefulset_status_replicas_ready|kube_node_spec_taint|storage_operation_duration_seconds_count|kubelet_pleg_relist_duration_seconds_count|cluster:namespace:pod_memory:active:kube_pod_container_resource_requests|kube_daemonset_status_number_available|kubelet_certificate_manager_server_ttl_seconds|container_network_receive_packets_total|namespace_workload_pod|kubelet_pod_worker_duration_seconds_count|kube_statefulset_status_replicas_updated|kube_pod_status_phase|volume_manager_total_volumes|kubelet_running_containers|storage_operation_errors_total|kube_statefulset_status_observed_generation|kubelet_node_config_error|container_fs_reads_bytes_total|kube_horizontalpodautoscaler_status_current_replicas|rest_client_request_duration_seconds_bucket|container_memory_cache|kube_daemonset_updated_number_scheduled|kube_job_spec_completions|kubelet_volume_stats_capacity_bytes|kube_daemonset_status_number_misscheduled|container_memory_rss|namespace_cpu:kube_pod_container_resource_limits:sum|kubelet_volume_stats_inodes_used|container_network_transmit_bytes_total|kubelet_runtime_operations_errors_total|container_fs_writes_total|kube_daemonset_status_current_number_scheduled|kube_node_status_capacity|container_network_receive_packets_dropped_total|storage_operation_duration_seconds_bucket|cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests|container_network_receive_bytes_total|kubelet_runtime_operations_total|kubelet_pod_start_duration_seconds_count|kube_horizontalpodautoscaler_status_desired_replicas|container_cpu_usage_seconds_total|up|kube_resourcequota|kubelet_cgroup_manager_duration_seconds_bucket|kube_horizontalpodautoscaler_spec_max_replicas|kubelet_server_expiration_renew_errors|container_fs_writes_bytes_total|kubelet_pod_worker_duration_seconds_bucket|machine_memory_bytes|cluster:namespace:pod_memory:active:kube_pod_container_resource_limits|kube_statefulset_status_current_revision|kube_deployment_status_observed_generation|kubelet_volume_stats_inodes|kubelet_volume_stats_available_bytes|kube_deployment_status_replicas_available|container_network_transmit_packets_total|kubelet_pleg_relist_duration_seconds_bucket|cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits|container_cpu_cfs_periods_total|process_cpu_seconds_total|kubelet_pleg_relist_interval_seconds_bucket|kube_horizontalpodautoscaler_spec_min_replicas|kube_statefulset_status_replicas|kube_deployment_spec_replicas|namespace_workload_pod:kube_pod_owner:relabel|kube_pod_owner|node_namespace_pod_container:container_memory_working_set_bytes|kubelet_node_name|kube_job_failed|container_memory_working_set_bytes|kubelet_runtime_operations_duration_seconds_bucket|namespace_memory:kube_pod_container_resource_requests:sum|kube_node_info|kubernetes_build_info|kube_statefulset_status_update_revision|kube_pod_container_resource_limits|kubelet_running_pods|kube_statefulset_replicas|kube_namespace_created|node_namespace_pod_container:container_memory_rss|node_namespace_pod_container:container_memory_cache|kube_node_status_condition|kube_pod_container_resource_requests|kubelet_running_pod_count|namespace_memory:kube_pod_container_resource_limits:sum|node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile|kube_pod_container_status_waiting_reason|kubelet_certificate_manager_client_expiration_renew_errors|kubelet_running_container_count|node_namespace_pod_container:container_memory_swap|kube_deployment_status_replicas_updated|process_resident_memory_bytes|rest_client_requests_total|kube_node_status_allocatable|kube_statefulset_metadata_generation|kube_deployment_metadata_generation|container_network_transmit_packets_dropped_total|kube_pod_info|kubelet_cgroup_manager_duration_seconds_count|container_fs_reads_total|kube_daemonset_status_desired_number_scheduled|container_cpu_cfs_throttled_periods_total|kube_job_status_succeeded|kubelet_certificate_manager_client_ttl_seconds|kube_replicaset_owner|go_goroutines|namespace_cpu:kube_pod_container_resource_requests:sum
+            action: keep
+          relabel_configs:
+          - action: keep
+            regex: ksm-kube-state-metrics
+            source_labels:
+            - __meta_kubernetes_service_name
+        - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
+          job_name: 'integrations/node_exporter'
+          kubernetes_sd_configs:
+            - role: endpoints
+          relabel_configs:
+            - source_labels: [__meta_kubernetes_endpoints_name]
+              regex: 'node-exporter'
+              action: keep
+            # relabel 'instance'
+            - source_labels: [__meta_kubernetes_pod_node_name]
+              action: replace
+              target_label: instance
+
+    integrations:
+      prometheus_remote_write:
+      - url: https://prometheus-prod-01-eu-west-0.grafana.net/api/prom/push
+        basic_auth:
+          username: $USERNAME
+          password: $APIKEY
+
+# logs example
+# logs:
+#   configs:
+#     - name: integrations
+#       clients:
+#         - url: https://logs-prod-eu-west-0.grafana.net/api/prom/push
+#           basic_auth:
+#             username: 195593
+#             password: $APIKEY
+#           external_labels:
+#             cluster: cloud
+#       positions:
+#         filename: /tmp/positions.yaml
+#       target_config:
+#         sync_period: 10s
diff --git a/src/main/resources/org/domaindrivenarchitecture/provs/server/infrastructure/grafana/grafana-agent.yaml b/src/main/resources/org/domaindrivenarchitecture/provs/server/infrastructure/grafana/grafana-agent.yaml
new file mode 100644
index 0000000..b78a117
--- /dev/null
+++ b/src/main/resources/org/domaindrivenarchitecture/provs/server/infrastructure/grafana/grafana-agent.yaml
@@ -0,0 +1,83 @@
+# https://raw.githubusercontent.com/grafana/agent/v0.23.0/production/kubernetes/agent-bare.yaml
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: grafana-agent
+  namespace: monitoring
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: grafana-agent
+rules:
+  - apiGroups:
+      - ""
+    resources:
+      - nodes
+      - nodes/proxy
+      - services
+      - endpoints
+      - pods
+    verbs:
+      - get
+      - list
+      - watch
+  - nonResourceURLs:
+      - /metrics
+    verbs:
+      - get
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: grafana-agent
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: grafana-agent
+subjects:
+  - kind: ServiceAccount
+    name: grafana-agent
+    namespace: monitoring
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: grafana-agent
+  namespace: monitoring
+spec:
+  minReadySeconds: 10
+  replicas: 1
+  revisionHistoryLimit: 10
+  selector:
+    matchLabels:
+      name: grafana-agent
+  template:
+    metadata:
+      labels:
+        name: grafana-agent
+    spec:
+      containers:
+        - args:
+            - -config.file=/etc/agent/agent.yaml
+          command:
+            - /bin/agent
+          env:
+            - name: HOSTNAME
+              valueFrom:
+                fieldRef:
+                  fieldPath: spec.nodeName
+          image: grafana/agent:v0.23.0
+          imagePullPolicy: IfNotPresent
+          name: agent
+          ports:
+            - containerPort: 12345
+              name: http-metrics
+          volumeMounts:
+            - mountPath: /etc/agent
+              name: grafana-agent
+      serviceAccount: grafana-agent
+      volumes:
+        - configMap:
+            name: grafana-agent
+          name: grafana-agent
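Note (annotation, not part of the patch): once the config map and the deployment above are applied, a small smoke test can confirm the rollout. A sketch using only the `cmd` primitive already used in this patch; the function name is hypothetical:

```kotlin
import org.domaindrivenarchitecture.provs.framework.core.Prov

fun Prov.checkGrafanaAgentRollout() = task {
    // fails the task if the deployment does not become ready in time
    cmd("kubectl -n monitoring rollout status deployment/grafana-agent --timeout=120s")
    // the generated config map must exist, otherwise the agent pod cannot mount it
    cmd("kubectl -n monitoring get configmap grafana-agent")
}
```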
diff --git a/src/main/resources/org/domaindrivenarchitecture/provs/server/infrastructure/grafana/node-exporter-daemon-set.yaml b/src/main/resources/org/domaindrivenarchitecture/provs/server/infrastructure/grafana/node-exporter-daemon-set.yaml
new file mode 100644
index 0000000..4ddbbf0
--- /dev/null
+++ b/src/main/resources/org/domaindrivenarchitecture/provs/server/infrastructure/grafana/node-exporter-daemon-set.yaml
@@ -0,0 +1,96 @@
+# see https://devopscube.com/node-exporter-kubernetes/
+# and https://www.opsramp.com/prometheus-monitoring/prometheus-node-exporter/
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  labels:
+    app.kubernetes.io/component: exporter
+    app.kubernetes.io/name: node-exporter
+  name: node-exporter
+  namespace: monitoring
+spec:
+  selector:
+    matchLabels:
+      app.kubernetes.io/component: exporter
+      app.kubernetes.io/name: node-exporter
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/component: exporter
+        app.kubernetes.io/name: node-exporter
+    spec:
+      containers:
+        - args:
+            - --path.sysfs=/host/sys
+            - --path.rootfs=/host/root
+            - --no-collector.wifi
+            - --no-collector.hwmon
+            - --no-collector.infiniband
+            - --no-collector.filefd
+            - --no-collector.ipvs
+            - --no-collector.mdadm
+            - --no-collector.netclass
+            - --no-collector.netstat
+            - --no-collector.nfsd
+            - --no-collector.nvme
+            - --no-collector.powersupplyclass
+            - --no-collector.pressure
+            - --no-collector.rapl
+            - --no-collector.schedstat
+            - --no-collector.sockstat
+            - --no-collector.softnet
+            - --no-collector.tapestats
+            - --no-collector.thermal_zone
+            - --no-collector.udp_queues
+            - --no-collector.xfs
+            - --no-collector.zfs
+            - --collector.filesystem.ignored-mount-points=^/(dev|proc|sys|var/lib/docker/.+|var/lib/kubelet/pods/.+)($|/)
+            - --collector.netclass.ignored-devices=^(veth.*)$
+          name: node-exporter
+          image: prom/node-exporter
+          ports:
+            - containerPort: 9100
+              protocol: TCP
+          resources:
+            limits:
+              cpu: 500m
+              memory: 180Mi
+            requests:
+              cpu: 102m
+              memory: 180Mi
+          volumeMounts:
+            - mountPath: /host/sys
+              mountPropagation: HostToContainer
+              name: sys
+              readOnly: true
+            - mountPath: /host/root
+              mountPropagation: HostToContainer
+              name: root
+              readOnly: true
+      volumes:
+        - hostPath:
+            path: /sys
+          name: sys
+        - hostPath:
+            path: /
+          name: root
+---
+kind: Service
+apiVersion: v1
+metadata:
+  name: node-exporter
+  namespace: monitoring
+  labels:
+    instance: primary
+  annotations:
+    prometheus.io/scrape: 'true'
+    prometheus.io/port: '9100'
+spec:
+  selector:
+    app.kubernetes.io/component: exporter
+    app.kubernetes.io/name: node-exporter
+  ports:
+    - name: node-exporter
+      protocol: TCP
+      port: 9100
+      targetPort: 9100
diff --git a/src/main/resources/org/domaindrivenarchitecture/provs/server/infrastructure/grafana/prom.yaml b/src/main/resources/org/domaindrivenarchitecture/provs/server/infrastructure/grafana/prom.yaml
new file mode 100644
index 0000000..2b63d8a
--- /dev/null
+++ b/src/main/resources/org/domaindrivenarchitecture/provs/server/infrastructure/grafana/prom.yaml
@@ -0,0 +1,22 @@
+apiVersion: monitoring.coreos.com/v1
+kind: Prometheus
+metadata:
+  name: prometheus
+  namespace: monitoring
+  labels:
+    app: prometheus
+spec:
+  image: quay.io/prometheus/prometheus:v2.22.1
+  nodeSelector:
+    kubernetes.io/os: linux
+  replicas: 2
+  resources:
+    requests:
+      memory: 400Mi
+  securityContext:
+    fsGroup: 2000
+    runAsNonRoot: true
+    runAsUser: 1000
+  serviceAccountName: prometheus
+  version: v2.22.1
+  serviceMonitorSelector: {}
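Note (annotation, not part of the patch): the agent config discovers node-exporter through the Service above (`role: endpoints` plus the `node-exporter` name filter), so one endpoint per node is expected once the daemon set is ready. A quick verification sketch; the function name is hypothetical:

```kotlin
import org.domaindrivenarchitecture.provs.framework.core.Prov

fun Prov.checkNodeExporter() = task {
    cmd("kubectl -n monitoring rollout status daemonset/node-exporter --timeout=120s")
    // lists one endpoint per node when discovery is working
    cmd("kubectl -n monitoring get endpoints node-exporter")
}
```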
diff --git a/src/main/resources/org/domaindrivenarchitecture/provs/server/infrastructure/grafana/prom_rbac.yaml b/src/main/resources/org/domaindrivenarchitecture/provs/server/infrastructure/grafana/prom_rbac.yaml
new file mode 100644
index 0000000..57dd41d
--- /dev/null
+++ b/src/main/resources/org/domaindrivenarchitecture/provs/server/infrastructure/grafana/prom_rbac.yaml
@@ -0,0 +1,42 @@
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: prometheus
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: prometheus
+rules:
+  - apiGroups: [""]
+    resources:
+      - nodes
+      - nodes/metrics
+      - services
+      - endpoints
+      - pods
+    verbs: ["get", "list", "watch"]
+  - apiGroups: [""]
+    resources:
+      - configmaps
+    verbs: ["get"]
+  - apiGroups:
+      - networking.k8s.io
+    resources:
+      - ingresses
+    verbs: ["get", "list", "watch"]
+  - nonResourceURLs: ["/metrics"]
+    verbs: ["get"]
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: prometheus
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: prometheus
+subjects:
+  - kind: ServiceAccount
+    name: prometheus
+    namespace: monitoring
diff --git a/src/main/resources/org/domaindrivenarchitecture/provs/server/infrastructure/grafana/prom_svc.yaml b/src/main/resources/org/domaindrivenarchitecture/provs/server/infrastructure/grafana/prom_svc.yaml
new file mode 100644
index 0000000..8c3ef8b
--- /dev/null
+++ b/src/main/resources/org/domaindrivenarchitecture/provs/server/infrastructure/grafana/prom_svc.yaml
@@ -0,0 +1,15 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: prometheus
+  namespace: monitoring
+  labels:
+    app: prometheus
+spec:
+  ports:
+    - name: web
+      port: 9090
+      targetPort: web
+  selector:
+    app: prometheus
+  sessionAffinity: ClientIP
diff --git a/src/main/resources/org/domaindrivenarchitecture/provs/server/infrastructure/grafana/prom_svc_mon.yaml b/src/main/resources/org/domaindrivenarchitecture/provs/server/infrastructure/grafana/prom_svc_mon.yaml
new file mode 100644
index 0000000..70c6d44
--- /dev/null
+++ b/src/main/resources/org/domaindrivenarchitecture/provs/server/infrastructure/grafana/prom_svc_mon.yaml
@@ -0,0 +1,14 @@
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  name: prometheus-self
+  namespace: monitoring
+  labels:
+    app: prometheus
+spec:
+  endpoints:
+    - interval: 30s
+      port: web
+  selector:
+    matchLabels:
+      app: prometheus
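Note (annotation, not part of the patch): the `prom*.yaml` resources above declare Prometheus Operator objects (`apiVersion: monitoring.coreos.com/v1`) and are not referenced by `provisionGrafanaAgentForK8s`. If the operator CRDs are installed, they could be deployed with the same resource-then-apply pattern used elsewhere in this patch; a sketch with a hypothetical function name:

```kotlin
import org.domaindrivenarchitecture.provs.framework.core.Prov
import org.domaindrivenarchitecture.provs.framework.ubuntu.filesystem.base.createFileFromResource
import org.domaindrivenarchitecture.provs.server.infrastructure.k3sManualManifestsDir

fun Prov.provisionPrometheusOperatorResources() = task {
    val resourcePath = "org/domaindrivenarchitecture/provs/server/infrastructure/grafana/"
    // order matters: RBAC first, then the Prometheus instance, then its service and monitor;
    // apply with -n monitoring because prom_rbac.yaml's ServiceAccount carries no namespace,
    // while the ClusterRoleBinding expects it in 'monitoring'
    for (manifest in listOf("prom_rbac.yaml", "prom.yaml", "prom_svc.yaml", "prom_svc_mon.yaml")) {
        createFileFromResource(k3sManualManifestsDir + manifest, manifest, resourcePath, "644", sudo = true)
        cmd("kubectl apply -n monitoring -f $k3sManualManifestsDir$manifest", sudo = true)
    }
}
```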