add grafana-agent

ansgarz 2022-05-08 18:52:50 +02:00
parent 968c5d01d6
commit 85697f43f9
15 changed files with 539 additions and 25 deletions

View file

@@ -11,7 +11,7 @@ internal val configDir = "/etc/prometheus/"
internal val configFile = "prometheus.yml"
fun Prov.configurePrometheusDocker(config: String = prometheusDefaultConfig) = task {
fun Prov.configurePrometheusDocker(config: String = prometheusDefaultConfig()) = task {
createDirs(configDir, sudo = true)
createFile(configDir + configFile, config, sudo = true)
}
@@ -51,25 +51,22 @@ fun Prov.runPrometheusDocker(nginxHost: String? = null) = task {
}
private const val prometheusDefaultConfig =
private fun prometheusDefaultConfig() =
"""
global:
scrape_interval: 15s # By default, scrape targets every 15 seconds.
# Attach these labels to any time series or alerts when communicating with
# external systems (federation, remote storage, Alertmanager).
external_labels:
monitor: 'codelab-monitor'
# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
# The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
- job_name: 'prometheus'
# Override the global default and scrape targets from this job every 5 seconds.
scrape_interval: 5s
static_configs:
- targets: ['localhost:9090']
remote_write:
- url: "<Your Metrics instance remote_write endpoint>"
basic_auth:
username: "your grafana username"
password: "your Grafana API key"
"""

View file

@@ -30,10 +30,8 @@ enum class SecretSourceType() {
@Serializable
@Suppress("unused") // for use in other projects
class SecretSupplier(private val source: SecretSourceType, val parameter: String) {
fun secret(): Secret {
return source.secret(parameter)
}
}
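
For reference, resolving a secret through a supplier is a one-liner. In this sketch, SecretSourceType.PLAIN is an assumption about the enum's members (they are not shown in this diff); substitute whichever source type the enum actually defines:

// Sketch: obtain a Secret via a SecretSupplier.
// SecretSourceType.PLAIN is assumed, not confirmed by this diff.
val supplier = SecretSupplier(SecretSourceType.PLAIN, "my-api-key")
val secret: Secret = supplier.secret()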

View file

@@ -1,15 +1,16 @@
package org.domaindrivenarchitecture.provs.server.domain.k3s
import org.domaindrivenarchitecture.provs.framework.core.Prov
import org.domaindrivenarchitecture.provs.framework.core.ProvResult
import org.domaindrivenarchitecture.provs.server.domain.k8s_grafana_agent.GrafanaAgentConfigResolved
import org.domaindrivenarchitecture.provs.server.domain.k8s_grafana_agent.provisionGrafanaAgent
import org.domaindrivenarchitecture.provs.server.infrastructure.*
import org.domaindrivenarchitecture.provs.server.infrastructure.getK3sConfig
/**
* Installs a k3s server.
*/
fun Prov.provisionK3s(cli: K3sCliCommand) = task {
val k3sConfig: K3sConfig = getK3sConfig(cli.configFileName)
val grafanaConfigResolved: GrafanaAgentConfigResolved? = findK8sGrafanaConfig(cli.configFileName)?.resolveSecret()
provisionNetwork(k3sConfig)
if (k3sConfig.reprovision && testConfigExists()) {
@ -21,11 +22,16 @@ fun Prov.provisionK3s(cli: K3sCliCommand) = task {
if (k3sConfig.certmanager != null) {
provisionK3sCertManager(k3sConfig.certmanager)
}
if (k3sConfig.echo == true) {
provisionK3sEcho(k3sConfig.fqdn, k3sConfig.certmanager?.letsencryptEndpoint)
}
if (grafanaConfigResolved != null) {
provisionGrafanaAgent(grafanaConfigResolved)
}
if (cli.applicationFileName != null) {
provisionK3sApplication(cli.applicationFileName)
}
ProvResult(true)
}

View file

@@ -0,0 +1,9 @@
package org.domaindrivenarchitecture.provs.server.domain.k8s_grafana_agent
import org.domaindrivenarchitecture.provs.framework.core.Prov
import org.domaindrivenarchitecture.provs.server.infrastructure.provisionGrafanaAgentForK8s
fun Prov.provisionGrafanaAgent(configResolved: GrafanaAgentConfigResolved) =
provisionGrafanaAgentForK8s(configResolved.user, configResolved.password, configResolved.cluster)

View file

@@ -0,0 +1,25 @@
package org.domaindrivenarchitecture.provs.server.domain.k8s_grafana_agent
import kotlinx.serialization.Serializable
import org.domaindrivenarchitecture.provs.framework.core.Secret
import org.domaindrivenarchitecture.provs.framework.ubuntu.secret.SecretSupplier
@Serializable
data class GrafanaAgentConfig(
val user: String,
val password: SecretSupplier,
val cluster: String
) {
fun resolveSecret(): GrafanaAgentConfigResolved = GrafanaAgentConfigResolved(this)
}
data class GrafanaAgentConfigResolved(val configUnresolved: GrafanaAgentConfig) {
val user: String = configUnresolved.user
val password: Secret = configUnresolved.password.secret()
val cluster: String = configUnresolved.cluster
}
@Serializable
data class GrafanaAgentConfigHolder(
val grafana: GrafanaAgentConfig
)
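
The split into an unresolved config (serializable, the secret still a supplier) and a resolved one (the secret materialized) keeps credentials out of serialized YAML until they are actually needed. A minimal sketch of the life cycle; SecretSourceType.PLAIN and all literal values are hypothetical:

// Sketch: construct a config, resolve its secret, hand it to the provisioner.
// The source type and all literal values are illustrative assumptions.
val config = GrafanaAgentConfig(
    user = "123456",
    password = SecretSupplier(SecretSourceType.PLAIN, "my-grafana-api-key"),
    cluster = "prod-cluster"
)
val resolved: GrafanaAgentConfigResolved = config.resolveSecret()
// resolved.password is now a Secret, ready for Prov.provisionGrafanaAgent(resolved)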

View file

@@ -0,0 +1,64 @@
package org.domaindrivenarchitecture.provs.server.infrastructure
import org.domaindrivenarchitecture.provs.framework.core.Prov
import org.domaindrivenarchitecture.provs.framework.core.Secret
import org.domaindrivenarchitecture.provs.framework.ubuntu.filesystem.base.createFileFromResource
import org.domaindrivenarchitecture.provs.framework.ubuntu.filesystem.base.createFileFromResourceTemplate
import org.domaindrivenarchitecture.provs.server.domain.k3s.FileMode
import java.io.File
private const val grafanaResourceDir = "org/domaindrivenarchitecture/provs/server/infrastructure/grafana/"
fun Prov.provisionGrafanaAgentForK8s(user: String, password: Secret, clusterName: String) = task {
val namespace = "monitoring"
// Create the namespace if it does not exist yet
if (!chk("kubectl get namespace $namespace")) {
cmd("kubectl create namespace $namespace")
}
// Deploy grafana-agent
applyGrafanaFileFromResource(File(k3sManualManifestsDir, "grafana-agent.yaml"))
// Deploy node-exporter
applyGrafanaFileFromResource(File(k3sManualManifestsDir, "node-exporter-daemon-set.yaml"))
// Deploy grafana config
createFileFromResourceTemplate(
k3sManualManifestsDir + "grafana-agent-config-map.yaml",
"grafana-agent-config-map.template.yaml",
resourcePath = grafanaResourceDir,
posixFilePermission = "644",
values = mapOf(
"USERNAME" to user,
"APIKEY" to password.plain(),
"CLUSTERNAME" to clusterName,
)
)
cmd("export NAMESPACE=$namespace && kubectl apply -n \$NAMESPACE -f grafana-agent-config-map.yaml", k3sManualManifestsDir)
// Restart grafana-agent so it picks up the updated config map
cmd("kubectl -n $namespace rollout restart deployment/grafana-agent")
}
// ============================ private functions =============================
private fun Prov.createGrafanaFileFromResource(
file: File,
posixFilePermission: FileMode? = "644"
) = task {
createFileFromResource(
file.path,
file.name,
grafanaResourceDir,
posixFilePermission,
sudo = true
)
}
private fun Prov.applyGrafanaFileFromResource(file: File, posixFilePermission: String? = "644") = task {
createGrafanaFileFromResource(file, posixFilePermission)
cmd("kubectl apply -f ${file.path}", sudo = true)
}
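
The values map above feeds the $USERNAME, $APIKEY, and $CLUSTERNAME placeholders in the grafana-agent config-map template further below. A sketch of a direct call, assuming Secret can wrap a plain string and a Prov session is available:

// Sketch: wire the grafana-agent into an existing cluster session.
// Credentials and cluster name are placeholders; Secret(String) is assumed.
fun Prov.provisionMonitoringExample() = task {
    provisionGrafanaAgentForK8s(
        user = "123456",
        password = Secret("my-grafana-api-key"),
        clusterName = "prod-cluster"
    )
}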

View file

@@ -0,0 +1,30 @@
package org.domaindrivenarchitecture.provs.server.infrastructure
import org.domaindrivenarchitecture.provs.configuration.domain.ConfigFileName
import org.domaindrivenarchitecture.provs.framework.core.readFromFile
import org.domaindrivenarchitecture.provs.framework.core.toYaml
import org.domaindrivenarchitecture.provs.framework.core.yamlToType
import org.domaindrivenarchitecture.provs.server.domain.k8s_grafana_agent.GrafanaAgentConfigHolder
import org.domaindrivenarchitecture.provs.server.domain.k8s_grafana_agent.GrafanaAgentConfig
import java.io.File
import java.io.FileWriter
private const val DEFAULT_CONFIG_FILE = "server-config.yaml"
fun findK8sGrafanaConfig(fileName: ConfigFileName? = null): GrafanaAgentConfig? {
val filePath = fileName?.fileName ?: DEFAULT_CONFIG_FILE
// read the grafana config section if the config file exists, otherwise return null
return if (File(filePath).exists()) {
readFromFile(filePath).yamlToType<GrafanaAgentConfigHolder>().grafana
} else {
null
}
}
@Suppress("unused")
internal fun writeConfig(config: GrafanaAgentConfigHolder, fileName: String = "grafana-config.yaml") =
FileWriter(fileName).use { it.write(config.toYaml()) }
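
Since findK8sGrafanaConfig returns null when no config file is present, grafana provisioning is simply skipped in provisionK3s (see above). A round-trip sketch with writeConfig; the ConfigFileName constructor call, SecretSourceType.PLAIN, and all literal values are assumptions:

// Sketch: write a config holder to disk and read it back.
// File name, credentials, and the secret source type are illustrative.
val holder = GrafanaAgentConfigHolder(
    grafana = GrafanaAgentConfig(
        user = "123456",
        password = SecretSupplier(SecretSourceType.PLAIN, "my-api-key"),
        cluster = "prod-cluster"
    )
)
writeConfig(holder, "server-config.yaml")
val reread = findK8sGrafanaConfig(ConfigFileName("server-config.yaml"))?.resolveSecret()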

View file

@@ -13,11 +13,12 @@ import java.io.File
// ----------------------------------- directories --------------------------------
private const val k3sResourceDir = "org/domaindrivenarchitecture/provs/server/infrastructure/k3s/"
const val k3sManualManifestsDir = "/etc/rancher/k3s/manifests/"
private const val k8sCredentialsDir = "/etc/kubernetes/"
private const val k3sAutomatedManifestsDir = "/var/lib/rancher/k3s/server/manifests/"
private const val k3sManualManifestsDir = "/etc/rancher/k3s/manifests/"
private const val k8sCredentialsDir = "/etc/kubernetes/"
private const val k3sResourceDir = "org/domaindrivenarchitecture/provs/server/infrastructure/k3s/"
// ----------------------------------- files --------------------------------
@@ -146,7 +147,11 @@ fun Prov.provisionK3sApplication(applicationFileName: ApplicationFileName) = task {
}
// ============================ private functions =============================
// ============================ private and internal functions =============================
internal fun Prov.applyK3sFile(file: File) = task {
cmd("kubectl apply -f ${file.path}", sudo = true)
}
private fun Prov.createK3sFileFromResource(
file: File,
@@ -192,10 +197,6 @@ private fun Prov.createK3sFileFromResourceTemplate(
)
}
private fun Prov.applyK3sFile(file: File) = task {
cmd("kubectl apply -f ${file.path}", sudo = true)
}
private fun File.templateName(): String {
return this.name.replace(".yaml", ".template.yaml")
}

View file

@@ -0,0 +1,112 @@
kind: ConfigMap
metadata:
name: grafana-agent
namespace: monitoring
apiVersion: v1
data:
agent.yaml: |
server:
http_listen_port: 12345
metrics:
wal_directory: /tmp/grafana-agent-wal
global:
scrape_interval: 60s
external_labels:
cluster: $CLUSTERNAME
configs:
- name: integrations
remote_write:
- url: https://prometheus-prod-01-eu-west-0.grafana.net/api/prom/push
basic_auth:
username: $USERNAME
password: $APIKEY
scrape_configs:
- bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
job_name: integrations/kubernetes/cadvisor
kubernetes_sd_configs:
- role: node
metric_relabel_configs:
- source_labels: [__name__]
regex: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate|container_memory_swap|kube_statefulset_status_replicas_ready|kube_node_spec_taint|storage_operation_duration_seconds_count|kubelet_pleg_relist_duration_seconds_count|cluster:namespace:pod_memory:active:kube_pod_container_resource_requests|kube_daemonset_status_number_available|kubelet_certificate_manager_server_ttl_seconds|container_network_receive_packets_total|namespace_workload_pod|kubelet_pod_worker_duration_seconds_count|kube_statefulset_status_replicas_updated|kube_pod_status_phase|volume_manager_total_volumes|kubelet_running_containers|storage_operation_errors_total|kube_statefulset_status_observed_generation|kubelet_node_config_error|container_fs_reads_bytes_total|kube_horizontalpodautoscaler_status_current_replicas|rest_client_request_duration_seconds_bucket|container_memory_cache|kube_daemonset_updated_number_scheduled|kube_job_spec_completions|kubelet_volume_stats_capacity_bytes|kube_daemonset_status_number_misscheduled|container_memory_rss|namespace_cpu:kube_pod_container_resource_limits:sum|kubelet_volume_stats_inodes_used|container_network_transmit_bytes_total|kubelet_runtime_operations_errors_total|container_fs_writes_total|kube_daemonset_status_current_number_scheduled|kube_node_status_capacity|container_network_receive_packets_dropped_total|storage_operation_duration_seconds_bucket|cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests|container_network_receive_bytes_total|kubelet_runtime_operations_total|kubelet_pod_start_duration_seconds_count|kube_horizontalpodautoscaler_status_desired_replicas|container_cpu_usage_seconds_total|up|kube_resourcequota|kubelet_cgroup_manager_duration_seconds_bucket|kube_horizontalpodautoscaler_spec_max_replicas|kubelet_server_expiration_renew_errors|container_fs_writes_bytes_total|kubelet_pod_worker_duration_seconds_bucket|machine_memory_bytes|cluster:namespace:pod_memory:active:kube_pod_container_resource_limits|kube_statefulset_status_current_revision|kube_deployment_status_observed_generation|kubelet_volume_stats_inodes|kubelet_volume_stats_available_bytes|kube_deployment_status_replicas_available|container_network_transmit_packets_total|kubelet_pleg_relist_duration_seconds_bucket|cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits|container_cpu_cfs_periods_total|process_cpu_seconds_total|kubelet_pleg_relist_interval_seconds_bucket|kube_horizontalpodautoscaler_spec_min_replicas|kube_statefulset_status_replicas|kube_deployment_spec_replicas|namespace_workload_pod:kube_pod_owner:relabel|kube_pod_owner|node_namespace_pod_container:container_memory_working_set_bytes|kubelet_node_name|kube_job_failed|container_memory_working_set_bytes|kubelet_runtime_operations_duration_seconds_bucket|namespace_memory:kube_pod_container_resource_requests:sum|kube_node_info|kubernetes_build_info|kube_statefulset_status_update_revision|kube_pod_container_resource_limits|kubelet_running_pods|kube_statefulset_replicas|kube_namespace_created|node_namespace_pod_container:container_memory_rss|node_namespace_pod_container:container_memory_cache|kube_node_status_condition|kube_pod_container_resource_requests|kubelet_running_pod_count|namespace_memory:kube_pod_container_resource_limits:sum|node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile|kube_pod_container_status_waiting_reason|kubelet_certificate_manager_client_expiration_renew_errors|kubelet_running_container_count|node_namespace_pod_container:container_memory_swap|kube_deployment_status_replicas_updated|process_resident_memory_bytes|rest_client_requests_total|kube_node_status_allocatable|kube_statefulset_metadata_generation|kube_deployment_metadata_generation|container_network_transmit_packets_dropped_total|kube_pod_info|kubelet_cgroup_manager_duration_seconds_count|container_fs_reads_total|kube_daemonset_status_desired_number_scheduled|container_cpu_cfs_throttled_periods_total|kube_job_status_succeeded|kubelet_certificate_manager_client_ttl_seconds|kube_replicaset_owner|go_goroutines|namespace_cpu:kube_pod_container_resource_requests:sum
action: keep
relabel_configs:
- replacement: kubernetes.default.svc.cluster.local:443
target_label: __address__
- regex: (.+)
replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
source_labels:
- __meta_kubernetes_node_name
target_label: __metrics_path__
scheme: https
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
insecure_skip_verify: false
server_name: kubernetes
- bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
job_name: integrations/kubernetes/kubelet
kubernetes_sd_configs:
- role: node
metric_relabel_configs:
- source_labels: [__name__]
regex: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate|container_memory_swap|kube_statefulset_status_replicas_ready|kube_node_spec_taint|storage_operation_duration_seconds_count|kubelet_pleg_relist_duration_seconds_count|cluster:namespace:pod_memory:active:kube_pod_container_resource_requests|kube_daemonset_status_number_available|kubelet_certificate_manager_server_ttl_seconds|container_network_receive_packets_total|namespace_workload_pod|kubelet_pod_worker_duration_seconds_count|kube_statefulset_status_replicas_updated|kube_pod_status_phase|volume_manager_total_volumes|kubelet_running_containers|storage_operation_errors_total|kube_statefulset_status_observed_generation|kubelet_node_config_error|container_fs_reads_bytes_total|kube_horizontalpodautoscaler_status_current_replicas|rest_client_request_duration_seconds_bucket|container_memory_cache|kube_daemonset_updated_number_scheduled|kube_job_spec_completions|kubelet_volume_stats_capacity_bytes|kube_daemonset_status_number_misscheduled|container_memory_rss|namespace_cpu:kube_pod_container_resource_limits:sum|kubelet_volume_stats_inodes_used|container_network_transmit_bytes_total|kubelet_runtime_operations_errors_total|container_fs_writes_total|kube_daemonset_status_current_number_scheduled|kube_node_status_capacity|container_network_receive_packets_dropped_total|storage_operation_duration_seconds_bucket|cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests|container_network_receive_bytes_total|kubelet_runtime_operations_total|kubelet_pod_start_duration_seconds_count|kube_horizontalpodautoscaler_status_desired_replicas|container_cpu_usage_seconds_total|up|kube_resourcequota|kubelet_cgroup_manager_duration_seconds_bucket|kube_horizontalpodautoscaler_spec_max_replicas|kubelet_server_expiration_renew_errors|container_fs_writes_bytes_total|kubelet_pod_worker_duration_seconds_bucket|machine_memory_bytes|cluster:namespace:pod_memory:active:kube_pod_container_resource_limits|kube_statefulset_status_current_revision|kube_deployment_status_observed_generation|kubelet_volume_stats_inodes|kubelet_volume_stats_available_bytes|kube_deployment_status_replicas_available|container_network_transmit_packets_total|kubelet_pleg_relist_duration_seconds_bucket|cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits|container_cpu_cfs_periods_total|process_cpu_seconds_total|kubelet_pleg_relist_interval_seconds_bucket|kube_horizontalpodautoscaler_spec_min_replicas|kube_statefulset_status_replicas|kube_deployment_spec_replicas|namespace_workload_pod:kube_pod_owner:relabel|kube_pod_owner|node_namespace_pod_container:container_memory_working_set_bytes|kubelet_node_name|kube_job_failed|container_memory_working_set_bytes|kubelet_runtime_operations_duration_seconds_bucket|namespace_memory:kube_pod_container_resource_requests:sum|kube_node_info|kubernetes_build_info|kube_statefulset_status_update_revision|kube_pod_container_resource_limits|kubelet_running_pods|kube_statefulset_replicas|kube_namespace_created|node_namespace_pod_container:container_memory_rss|node_namespace_pod_container:container_memory_cache|kube_node_status_condition|kube_pod_container_resource_requests|kubelet_running_pod_count|namespace_memory:kube_pod_container_resource_limits:sum|node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile|kube_pod_container_status_waiting_reason|kubelet_certificate_manager_client_expiration_renew_errors|kubelet_running_container_count|node_namespace_pod_container:container_memory_swap|kube_deployment_status_replicas_updated|process_resident_memory_bytes|rest_client_requests_total|kube_node_status_allocatable|kube_statefulset_metadata_generation|kube_deployment_metadata_generation|container_network_transmit_packets_dropped_total|kube_pod_info|kubelet_cgroup_manager_duration_seconds_count|container_fs_reads_total|kube_daemonset_status_desired_number_scheduled|container_cpu_cfs_throttled_periods_total|kube_job_status_succeeded|kubelet_certificate_manager_client_ttl_seconds|kube_replicaset_owner|go_goroutines|namespace_cpu:kube_pod_container_resource_requests:sum
action: keep
relabel_configs:
- replacement: kubernetes.default.svc.cluster.local:443
target_label: __address__
- regex: (.+)
replacement: /api/v1/nodes/${1}/proxy/metrics
source_labels:
- __meta_kubernetes_node_name
target_label: __metrics_path__
scheme: https
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
insecure_skip_verify: false
server_name: kubernetes
- job_name: integrations/kubernetes/kube-state-metrics
kubernetes_sd_configs:
- role: service
metric_relabel_configs:
- source_labels: [__name__]
regex: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate|container_memory_swap|kube_statefulset_status_replicas_ready|kube_node_spec_taint|storage_operation_duration_seconds_count|kubelet_pleg_relist_duration_seconds_count|cluster:namespace:pod_memory:active:kube_pod_container_resource_requests|kube_daemonset_status_number_available|kubelet_certificate_manager_server_ttl_seconds|container_network_receive_packets_total|namespace_workload_pod|kubelet_pod_worker_duration_seconds_count|kube_statefulset_status_replicas_updated|kube_pod_status_phase|volume_manager_total_volumes|kubelet_running_containers|storage_operation_errors_total|kube_statefulset_status_observed_generation|kubelet_node_config_error|container_fs_reads_bytes_total|kube_horizontalpodautoscaler_status_current_replicas|rest_client_request_duration_seconds_bucket|container_memory_cache|kube_daemonset_updated_number_scheduled|kube_job_spec_completions|kubelet_volume_stats_capacity_bytes|kube_daemonset_status_number_misscheduled|container_memory_rss|namespace_cpu:kube_pod_container_resource_limits:sum|kubelet_volume_stats_inodes_used|container_network_transmit_bytes_total|kubelet_runtime_operations_errors_total|container_fs_writes_total|kube_daemonset_status_current_number_scheduled|kube_node_status_capacity|container_network_receive_packets_dropped_total|storage_operation_duration_seconds_bucket|cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests|container_network_receive_bytes_total|kubelet_runtime_operations_total|kubelet_pod_start_duration_seconds_count|kube_horizontalpodautoscaler_status_desired_replicas|container_cpu_usage_seconds_total|up|kube_resourcequota|kubelet_cgroup_manager_duration_seconds_bucket|kube_horizontalpodautoscaler_spec_max_replicas|kubelet_server_expiration_renew_errors|container_fs_writes_bytes_total|kubelet_pod_worker_duration_seconds_bucket|machine_memory_bytes|cluster:namespace:pod_memory:active:kube_pod_container_resource_limits|kube_statefulset_status_current_revision|kube_deployment_status_observed_generation|kubelet_volume_stats_inodes|kubelet_volume_stats_available_bytes|kube_deployment_status_replicas_available|container_network_transmit_packets_total|kubelet_pleg_relist_duration_seconds_bucket|cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits|container_cpu_cfs_periods_total|process_cpu_seconds_total|kubelet_pleg_relist_interval_seconds_bucket|kube_horizontalpodautoscaler_spec_min_replicas|kube_statefulset_status_replicas|kube_deployment_spec_replicas|namespace_workload_pod:kube_pod_owner:relabel|kube_pod_owner|node_namespace_pod_container:container_memory_working_set_bytes|kubelet_node_name|kube_job_failed|container_memory_working_set_bytes|kubelet_runtime_operations_duration_seconds_bucket|namespace_memory:kube_pod_container_resource_requests:sum|kube_node_info|kubernetes_build_info|kube_statefulset_status_update_revision|kube_pod_container_resource_limits|kubelet_running_pods|kube_statefulset_replicas|kube_namespace_created|node_namespace_pod_container:container_memory_rss|node_namespace_pod_container:container_memory_cache|kube_node_status_condition|kube_pod_container_resource_requests|kubelet_running_pod_count|namespace_memory:kube_pod_container_resource_limits:sum|node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile|kube_pod_container_status_waiting_reason|kubelet_certificate_manager_client_expiration_renew_errors|kubelet_running_container_count|node_namespace_pod_container:container_memory_swap|kube_deployment_status_replicas_updated|process_resident_memory_bytes|rest_client_requests_total|kube_node_status_allocatable|kube_statefulset_metadata_generation|kube_deployment_metadata_generation|container_network_transmit_packets_dropped_total|kube_pod_info|kubelet_cgroup_manager_duration_seconds_count|container_fs_reads_total|kube_daemonset_status_desired_number_scheduled|container_cpu_cfs_throttled_periods_total|kube_job_status_succeeded|kubelet_certificate_manager_client_ttl_seconds|kube_replicaset_owner|go_goroutines|namespace_cpu:kube_pod_container_resource_requests:sum
action: keep
relabel_configs:
- action: keep
regex: ksm-kube-state-metrics
source_labels:
- __meta_kubernetes_service_name
- bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
job_name: 'integrations/node_exporter'
kubernetes_sd_configs:
- role: endpoints
relabel_configs:
- source_labels: [__meta_kubernetes_endpoints_name]
regex: 'node-exporter'
action: keep
# relabel 'instance'
- source_labels: [__meta_kubernetes_pod_node_name]
action: replace
target_label: instance
integrations:
prometheus_remote_write:
- url: https://prometheus-prod-01-eu-west-0.grafana.net/api/prom/push
basic_auth:
username: $USERNAME
password: $APIKEY
# logs example
# logs:
# configs:
# - name: integrations
# clients:
# - url: https://logs-prod-eu-west-0.grafana.net/api/prom/push
# basic_auth:
# username: 195593
# password: $APIKEY
# external_labels:
# cluster: cloud
# positions:
# filename: /tmp/positions.yaml
# target_config:
# sync_period: 10s

View file

@@ -0,0 +1,83 @@
# https://raw.githubusercontent.com/grafana/agent/v0.23.0/production/kubernetes/agent-bare.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
name: grafana-agent
namespace: monitoring
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: grafana-agent
rules:
- apiGroups:
- ""
resources:
- nodes
- nodes/proxy
- services
- endpoints
- pods
verbs:
- get
- list
- watch
- nonResourceURLs:
- /metrics
verbs:
- get
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: grafana-agent
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: grafana-agent
subjects:
- kind: ServiceAccount
name: grafana-agent
namespace: monitoring
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: grafana-agent
namespace: monitoring
spec:
minReadySeconds: 10
replicas: 1
revisionHistoryLimit: 10
selector:
matchLabels:
name: grafana-agent
template:
metadata:
labels:
name: grafana-agent
spec:
containers:
- args:
- -config.file=/etc/agent/agent.yaml
command:
- /bin/agent
env:
- name: HOSTNAME
valueFrom:
fieldRef:
fieldPath: spec.nodeName
image: grafana/agent:v0.23.0
imagePullPolicy: IfNotPresent
name: agent
ports:
- containerPort: 12345
name: http-metrics
volumeMounts:
- mountPath: /etc/agent
name: grafana-agent
serviceAccount: grafana-agent
volumes:
- configMap:
name: grafana-agent
name: grafana-agent

View file

@@ -0,0 +1,96 @@
# see https://devopscube.com/node-exporter-kubernetes/
# and https://www.opsramp.com/prometheus-monitoring/prometheus-node-exporter/
apiVersion: apps/v1
kind: DaemonSet
metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: node-exporter
name: node-exporter
namespace: monitoring
spec:
selector:
matchLabels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: node-exporter
template:
metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: node-exporter
spec:
containers:
- args:
- --path.sysfs=/host/sys
- --path.rootfs=/host/root
- --no-collector.wifi
- --no-collector.hwmon
- --no-collector.infiniband
- --no-collector.filefd
- --no-collector.ipvs
- --no-collector.mdadm
- --no-collector.netclass
- --no-collector.netstat
- --no-collector.nfsd
- --no-collector.nvme
- --no-collector.powersupplyclass
- --no-collector.pressure
- --no-collector.rapl
- --no-collector.schedstat
- --no-collector.sockstat
- --no-collector.softnet
- --no-collector.tapestats
- --no-collector.thermal_zone
- --no-collector.udp_queues
- --no-collector.xfs
- --no-collector.zfs
- --collector.filesystem.ignored-mount-points=^/(dev|proc|sys|var/lib/docker/.+|var/lib/kubelet/pods/.+)($|/)
- --collector.netclass.ignored-devices=^(veth.*)$
name: node-exporter
image: prom/node-exporter
ports:
- containerPort: 9100
protocol: TCP
resources:
limits:
cpu: 500m
memory: 180Mi
requests:
cpu: 102m
memory: 180Mi
volumeMounts:
- mountPath: /host/sys
mountPropagation: HostToContainer
name: sys
readOnly: true
- mountPath: /host/root
mountPropagation: HostToContainer
name: root
readOnly: true
volumes:
- hostPath:
path: /sys
name: sys
- hostPath:
path: /
name: root
---
kind: Service
apiVersion: v1
metadata:
name: node-exporter
namespace: monitoring
labels:
instance: primary
annotations:
prometheus.io/scrape: 'true'
prometheus.io/port: '9100'
spec:
selector:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: node-exporter
ports:
- name: node-exporter
protocol: TCP
port: 9100
targetPort: 9100

View file

@@ -0,0 +1,22 @@
apiVersion: monitoring.coreos.com/v1
kind: Prometheus
metadata:
name: prometheus
namespace: monitoring
labels:
app: prometheus
spec:
image: quay.io/prometheus/prometheus:v2.22.1
nodeSelector:
kubernetes.io/os: linux
replicas: 2
resources:
requests:
memory: 400Mi
securityContext:
fsGroup: 2000
runAsNonRoot: true
runAsUser: 1000
serviceAccountName: prometheus
version: v2.22.1
serviceMonitorSelector: {}

View file

@@ -0,0 +1,42 @@
apiVersion: v1
kind: ServiceAccount
metadata:
name: prometheus
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: prometheus
rules:
- apiGroups: [""]
resources:
- nodes
- nodes/metrics
- services
- endpoints
- pods
verbs: ["get", "list", "watch"]
- apiGroups: [""]
resources:
- configmaps
verbs: ["get"]
- apiGroups:
- networking.k8s.io
resources:
- ingresses
verbs: ["get", "list", "watch"]
- nonResourceURLs: ["/metrics"]
verbs: ["get"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: prometheus
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: prometheus
subjects:
- kind: ServiceAccount
name: prometheus
namespace: monitoring

View file

@@ -0,0 +1,15 @@
apiVersion: v1
kind: Service
metadata:
name: prometheus
namespace: monitoring
labels:
app: prometheus
spec:
ports:
- name: web
port: 9090
targetPort: web
selector:
app: prometheus
sessionAffinity: ClientIP

View file

@@ -0,0 +1,14 @@
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: prometheus-self
namespace: monitoring
labels:
app: prometheus
spec:
endpoints:
- interval: 30s
port: web
selector:
matchLabels:
app: prometheus