add grafana-agent
parent 968c5d01d6
commit 85697f43f9
15 changed files with 539 additions and 25 deletions
@@ -11,7 +11,7 @@ internal val configDir = "/etc/prometheus/"
 internal val configFile = "prometheus.yml"
 
 
-fun Prov.configurePrometheusDocker(config: String = prometheusDefaultConfig) = task {
+fun Prov.configurePrometheusDocker(config: String = prometheusDefaultConfig()) = task {
     createDirs(configDir, sudo = true)
     createFile(configDir + configFile, config, sudo = true)
 }
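For orientation, a minimal usage sketch, not part of this commit: provisionMonitoringExample is a made-up name; configurePrometheusDocker uses the default config unless the caller supplies its own YAML string.

import org.domaindrivenarchitecture.provs.framework.core.Prov

// Hypothetical usage sketch: call with the default config or override it.
fun Prov.provisionMonitoringExample() = task {
    configurePrometheusDocker()
    // or, with a custom scrape config:
    // configurePrometheusDocker("global:\n  scrape_interval: 30s\n")
}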
@@ -51,25 +51,22 @@ fun Prov.runPrometheusDocker(nginxHost: String? = null) = task {
 }
 
 
-private const val prometheusDefaultConfig =
+private fun prometheusDefaultConfig() =
     """
     global:
       scrape_interval: 15s # By default, scrape targets every 15 seconds.
 
       # Attach these labels to any time series or alerts when communicating with
       # external systems (federation, remote storage, Alertmanager).
      external_labels:
         monitor: 'codelab-monitor'
 
     # A scrape configuration containing exactly one endpoint to scrape:
     # Here it's Prometheus itself.
     scrape_configs:
       # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
       - job_name: 'prometheus'
 
         # Override the global default and scrape targets from this job every 5 seconds.
         scrape_interval: 5s
 
         static_configs:
           - targets: ['localhost:9090']
 
     remote_write:
       - url: "<Your Metrics instance remote_write endpoint>"
         basic_auth:
           username: "your grafana username"
           password: "your Grafana API key"
     """
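Turning the default config from a const into a function means it is evaluated at call time and could later take parameters without changing call sites. A hypothetical follow-up, not in this commit; the function name and parameter are invented here:

// Hypothetical extension sketch: parameterize the generated config.
private fun prometheusDefaultConfigWith(scrapeInterval: String = "15s") =
    """
    global:
      scrape_interval: $scrapeInterval
    """.trimIndent()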
@@ -30,10 +30,8 @@ enum class SecretSourceType() {
 
 
 @Serializable
 @Suppress("unused") // for use in other projects
 class SecretSupplier(private val source: SecretSourceType, val parameter: String) {
     fun secret(): Secret {
         return source.secret(parameter)
     }
 }
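A minimal sketch of how a SecretSupplier resolves; SecretSourceType.PLAIN is assumed here and not shown in this diff:

// Hedged sketch: the supplier delegates to source.secret(parameter) when asked.
val supplier = SecretSupplier(SecretSourceType.PLAIN, "my-api-key")   // PLAIN is assumed
val secret: Secret = supplier.secret()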
@@ -1,15 +1,16 @@
 package org.domaindrivenarchitecture.provs.server.domain.k3s
 
 import org.domaindrivenarchitecture.provs.framework.core.Prov
 import org.domaindrivenarchitecture.provs.framework.core.ProvResult
+import org.domaindrivenarchitecture.provs.server.domain.k8s_grafana_agent.GrafanaAgentConfigResolved
+import org.domaindrivenarchitecture.provs.server.domain.k8s_grafana_agent.provisionGrafanaAgent
 import org.domaindrivenarchitecture.provs.server.infrastructure.*
-import org.domaindrivenarchitecture.provs.server.infrastructure.getK3sConfig
 
 /**
  * Installs a k3s server.
  */
 fun Prov.provisionK3s(cli: K3sCliCommand) = task {
     val k3sConfig: K3sConfig = getK3sConfig(cli.configFileName)
+    val grafanaConfigResolved: GrafanaAgentConfigResolved? = findK8sGrafanaConfig(cli.configFileName)?.resolveSecret()
 
     provisionNetwork(k3sConfig)
     if (k3sConfig.reprovision && testConfigExists()) {
@@ -21,11 +22,16 @@ fun Prov.provisionK3s(cli: K3sCliCommand) = task {
     if (k3sConfig.certmanager != null) {
         provisionK3sCertManager(k3sConfig.certmanager)
     }
 
     if (k3sConfig.echo == true) {
         provisionK3sEcho(k3sConfig.fqdn, k3sConfig.certmanager?.letsencryptEndpoint)
     }
 
+    if (grafanaConfigResolved != null) {
+        provisionGrafanaAgent(grafanaConfigResolved)
+    }
+
     if (cli.applicationFileName != null) {
         provisionK3sApplication(cli.applicationFileName)
     }
     ProvResult(true)
 }
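The grafana-agent step is strictly opt-in: it only runs when the server config file contains a grafana section. A hedged sketch of such a section, expressed as a Kotlin string; field names follow GrafanaAgentConfigHolder/GrafanaAgentConfig/SecretSupplier as added in this commit, and the serialized shape of SecretSupplier is assumed from its @Serializable declaration:

// Hedged sketch, not in this commit: a server-config.yaml fragment that would
// activate the agent step. Values are examples.
val exampleServerConfig = """
    grafana:
      user: "123456"
      password:
        source: "PLAIN"
        parameter: "my-grafana-api-key"
      cluster: "my-cluster"
    """.trimIndent()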
@@ -0,0 +1,9 @@
+package org.domaindrivenarchitecture.provs.server.domain.k8s_grafana_agent
+
+import org.domaindrivenarchitecture.provs.framework.core.Prov
+import org.domaindrivenarchitecture.provs.server.infrastructure.provisionGrafanaAgentForK8s
+
+
+fun Prov.provisionGrafanaAgent(configResolved: GrafanaAgentConfigResolved) =
+    provisionGrafanaAgentForK8s(configResolved.user, configResolved.password, configResolved.cluster)
+
@@ -0,0 +1,25 @@
+package org.domaindrivenarchitecture.provs.server.domain.k8s_grafana_agent
+
+import kotlinx.serialization.Serializable
+import org.domaindrivenarchitecture.provs.framework.core.Secret
+import org.domaindrivenarchitecture.provs.framework.ubuntu.secret.SecretSupplier
+
+@Serializable
+data class GrafanaAgentConfig(
+    val user: String,
+    val password: SecretSupplier,
+    val cluster: String
+) {
+    fun resolveSecret(): GrafanaAgentConfigResolved = GrafanaAgentConfigResolved(this)
+}
+
+data class GrafanaAgentConfigResolved(val configUnresolved: GrafanaAgentConfig) {
+    val user: String = configUnresolved.user
+    val password: Secret = configUnresolved.password.secret()
+    val cluster: String = configUnresolved.cluster
+}
+
+@Serializable
+data class GrafanaAgentConfigHolder(
+    val grafana: GrafanaAgentConfig
+)
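Note the split between the serializable GrafanaAgentConfig and the non-serializable GrafanaAgentConfigResolved: the secret is only materialized when resolveSecret() is called. A minimal sketch with example values; SecretSourceType.PLAIN is assumed to exist:

// Hedged sketch: the secret is fetched at resolve time, not at construction time.
val config = GrafanaAgentConfig(
    user = "123456",
    password = SecretSupplier(SecretSourceType.PLAIN, "my-grafana-api-key"),
    cluster = "my-cluster"
)
val resolved: GrafanaAgentConfigResolved = config.resolveSecret()   // password.secret() runs here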
@@ -0,0 +1,64 @@
+package org.domaindrivenarchitecture.provs.server.infrastructure
+
+import org.domaindrivenarchitecture.provs.framework.core.Prov
+import org.domaindrivenarchitecture.provs.framework.core.Secret
+import org.domaindrivenarchitecture.provs.framework.ubuntu.filesystem.base.createFileFromResource
+import org.domaindrivenarchitecture.provs.framework.ubuntu.filesystem.base.createFileFromResourceTemplate
+import org.domaindrivenarchitecture.provs.server.domain.k3s.FileMode
+import java.io.File
+
+
+private const val grafanaResourceDir = "org/domaindrivenarchitecture/provs/server/infrastructure/grafana/"
+
+
+fun Prov.provisionGrafanaAgentForK8s(user: String, password: Secret, clusterName: String) = task {
+    val namespace = "monitoring"
+
+    // Create namespace if not yet existing
+    if (!chk("kubectl get namespace $namespace")) {
+        cmd("kubectl create namespace $namespace")
+    }
+
+    // Deploy grafana-agent
+    applyGrafanaFileFromResource(File(k3sManualManifestsDir, "grafana-agent.yaml"))
+
+    // Deploy node-exporter
+    applyGrafanaFileFromResource(File(k3sManualManifestsDir, "node-exporter-daemon-set.yaml"))
+
+    // Deploy grafana config
+    createFileFromResourceTemplate(
+        k3sManualManifestsDir + "grafana-agent-config-map.yaml",
+        "grafana-agent-config-map.template.yaml",
+        resourcePath = grafanaResourceDir,
+        posixFilePermission = "644",
+        values = mapOf(
+            "USERNAME" to user,
+            "APIKEY" to password.plain(),
+            "CLUSTERNAME" to clusterName,
+        )
+    )
+    cmd("export NAMESPACE=$namespace && kubectl apply -n \$NAMESPACE -f grafana-agent-config-map.yaml", k3sManualManifestsDir)
+
+    // restart grafana-agent
+    cmd("kubectl -n $namespace rollout restart deployment/grafana-agent")
+}
+
+// ============================ private functions =============================
+
+private fun Prov.createGrafanaFileFromResource(
+    file: File,
+    posixFilePermission: FileMode? = "644"
+) = task {
+    createFileFromResource(
+        file.path,
+        file.name,
+        grafanaResourceDir,
+        posixFilePermission,
+        sudo = true
+    )
+}
+
+private fun Prov.applyGrafanaFileFromResource(file: File, posixFilePermission: String? = "644") = task {
+    createGrafanaFileFromResource(file, posixFilePermission)
+    cmd("kubectl apply -f ${file.path}", sudo = true)
+}
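A hedged sketch of calling this infrastructure function directly; whether Secret offers a plain-string constructor is an assumption and not shown in this diff:

// Hedged sketch: direct invocation with example values.
fun Prov.provisionGrafanaAgentExample() =
    provisionGrafanaAgentForK8s("123456", Secret("my-grafana-api-key"), "my-cluster")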
@@ -0,0 +1,30 @@
+package org.domaindrivenarchitecture.provs.server.infrastructure
+
+import org.domaindrivenarchitecture.provs.configuration.domain.ConfigFileName
+import org.domaindrivenarchitecture.provs.framework.core.readFromFile
+import org.domaindrivenarchitecture.provs.framework.core.toYaml
+import org.domaindrivenarchitecture.provs.framework.core.yamlToType
+import org.domaindrivenarchitecture.provs.server.domain.k8s_grafana_agent.GrafanaAgentConfigHolder
+import org.domaindrivenarchitecture.provs.server.domain.k8s_grafana_agent.GrafanaAgentConfig
+import java.io.File
+import java.io.FileWriter
+
+
+private const val DEFAULT_CONFIG_FILE = "server-config.yaml"
+
+
+fun findK8sGrafanaConfig(fileName: ConfigFileName? = null): GrafanaAgentConfig? {
+    val filePath = fileName?.fileName ?: DEFAULT_CONFIG_FILE
+
+    // read the grafana config from the config file if it exists, otherwise return null
+    return if (File(filePath).exists()) {
+        readFromFile(filePath).yamlToType<GrafanaAgentConfigHolder>().grafana
+    } else {
+        null
+    }
+}
+
+
+@Suppress("unused")
+internal fun writeConfig(config: GrafanaAgentConfigHolder, fileName: String = "grafana-config.yaml") =
+    FileWriter(fileName).use { it.write(config.toYaml()) }
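A hedged sketch of producing a sample config file with the helper above; writeConfig is internal, so this only compiles inside the same module, and the values are examples:

// Hedged sketch: round-trip a config holder to YAML.
fun writeSampleConfig() =
    writeConfig(
        GrafanaAgentConfigHolder(
            GrafanaAgentConfig(
                user = "123456",
                password = SecretSupplier(SecretSourceType.PLAIN, "my-grafana-api-key"),
                cluster = "my-cluster"
            )
        ),
        fileName = "server-config.yaml"
    )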
@@ -13,11 +13,12 @@ import java.io.File
 
 // ----------------------------------- directories --------------------------------
 
+private const val k3sResourceDir = "org/domaindrivenarchitecture/provs/server/infrastructure/k3s/"
+const val k3sManualManifestsDir = "/etc/rancher/k3s/manifests/"
+
+private const val k8sCredentialsDir = "/etc/kubernetes/"
 private const val k3sAutomatedManifestsDir = "/var/lib/rancher/k3s/server/manifests/"
-private const val k3sManualManifestsDir = "/etc/rancher/k3s/manifests/"
-private const val k8sCredentialsDir = "/etc/kubernetes/"
-
-private const val k3sResourceDir = "org/domaindrivenarchitecture/provs/server/infrastructure/k3s/"
 
 // ----------------------------------- files --------------------------------
 
@@ -146,7 +147,11 @@ fun Prov.provisionK3sApplication(applicationFileName: ApplicationFileName) = tas
 }
 
 
-// ============================ private functions =============================
+// ============================ private and internal functions =============================
 
+internal fun Prov.applyK3sFile(file: File) = task {
+    cmd("kubectl apply -f ${file.path}", sudo = true)
+}
+
 private fun Prov.createK3sFileFromResource(
     file: File,
@@ -192,10 +197,6 @@ private fun Prov.createK3sFileFromResourceTemplate(
     )
 }
 
-private fun Prov.applyK3sFile(file: File) = task {
-    cmd("kubectl apply -f ${file.path}", sudo = true)
-}
-
 private fun File.templateName(): String {
     return this.name.replace(".yaml", ".template.yaml")
 }
@@ -0,0 +1,112 @@
+kind: ConfigMap
+metadata:
+  name: grafana-agent
+  namespace: monitoring
+apiVersion: v1
+data:
+  agent.yaml: |
+    server:
+      http_listen_port: 12345
+    metrics:
+      wal_directory: /tmp/grafana-agent-wal
+      global:
+        scrape_interval: 60s
+        external_labels:
+          cluster: $CLUSTERNAME
+      configs:
+      - name: integrations
+        remote_write:
+        - url: https://prometheus-prod-01-eu-west-0.grafana.net/api/prom/push
+          basic_auth:
+            username: $USERNAME
+            password: $APIKEY
+        scrape_configs:
+        - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
+          job_name: integrations/kubernetes/cadvisor
+          kubernetes_sd_configs:
+          - role: node
+          metric_relabel_configs:
+          - source_labels: [__name__]
+            regex: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate|container_memory_swap|kube_statefulset_status_replicas_ready|kube_node_spec_taint|storage_operation_duration_seconds_count|kubelet_pleg_relist_duration_seconds_count|cluster:namespace:pod_memory:active:kube_pod_container_resource_requests|kube_daemonset_status_number_available|kubelet_certificate_manager_server_ttl_seconds|container_network_receive_packets_total|namespace_workload_pod|kubelet_pod_worker_duration_seconds_count|kube_statefulset_status_replicas_updated|kube_pod_status_phase|volume_manager_total_volumes|kubelet_running_containers|storage_operation_errors_total|kube_statefulset_status_observed_generation|kubelet_node_config_error|container_fs_reads_bytes_total|kube_horizontalpodautoscaler_status_current_replicas|rest_client_request_duration_seconds_bucket|container_memory_cache|kube_daemonset_updated_number_scheduled|kube_job_spec_completions|kubelet_volume_stats_capacity_bytes|kube_daemonset_status_number_misscheduled|container_memory_rss|namespace_cpu:kube_pod_container_resource_limits:sum|kubelet_volume_stats_inodes_used|container_network_transmit_bytes_total|kubelet_runtime_operations_errors_total|container_fs_writes_total|kube_daemonset_status_current_number_scheduled|kube_node_status_capacity|container_network_receive_packets_dropped_total|storage_operation_duration_seconds_bucket|cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests|container_network_receive_bytes_total|kubelet_runtime_operations_total|kubelet_pod_start_duration_seconds_count|kube_horizontalpodautoscaler_status_desired_replicas|container_cpu_usage_seconds_total|up|kube_resourcequota|kubelet_cgroup_manager_duration_seconds_bucket|kube_horizontalpodautoscaler_spec_max_replicas|kubelet_server_expiration_renew_errors|container_fs_writes_bytes_total|kubelet_pod_worker_duration_seconds_bucket|machine_memory_bytes|cluster:namespace:pod_memory:active:kube_pod_container_resource_limits|kube_statefulset_status_current_revision|kube_deployment_status_observed_generation|kubelet_volume_stats_inodes|kubelet_volume_stats_available_bytes|kube_deployment_status_replicas_available|container_network_transmit_packets_total|kubelet_pleg_relist_duration_seconds_bucket|cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits|container_cpu_cfs_periods_total|process_cpu_seconds_total|kubelet_pleg_relist_interval_seconds_bucket|kube_horizontalpodautoscaler_spec_min_replicas|kube_statefulset_status_replicas|kube_deployment_spec_replicas|namespace_workload_pod:kube_pod_owner:relabel|kube_pod_owner|node_namespace_pod_container:container_memory_working_set_bytes|kubelet_node_name|kube_job_failed|container_memory_working_set_bytes|kubelet_runtime_operations_duration_seconds_bucket|namespace_memory:kube_pod_container_resource_requests:sum|kube_node_info|kubernetes_build_info|kube_statefulset_status_update_revision|kube_pod_container_resource_limits|kubelet_running_pods|kube_statefulset_replicas|kube_namespace_created|node_namespace_pod_container:container_memory_rss|node_namespace_pod_container:container_memory_cache|kube_node_status_condition|kube_pod_container_resource_requests|kubelet_running_pod_count|namespace_memory:kube_pod_container_resource_limits:sum|node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile|kube_pod_container_status_waiting_reason|kubelet_certificate_manager_client_expiration_renew_errors|kubelet_running_container_count|node_namespace_pod_container:container_memory_swap|kube_deployment_status_replicas_updated|process_resident_memory_bytes|rest_client_requests_total|kube_node_status_allocatable|kube_statefulset_metadata_generation|kube_deployment_metadata_generation|container_network_transmit_packets_dropped_total|kube_pod_info|kubelet_cgroup_manager_duration_seconds_count|container_fs_reads_total|kube_daemonset_status_desired_number_scheduled|container_cpu_cfs_throttled_periods_total|kube_job_status_succeeded|kubelet_certificate_manager_client_ttl_seconds|kube_replicaset_owner|go_goroutines|namespace_cpu:kube_pod_container_resource_requests:sum
+            action: keep
+          relabel_configs:
+          - replacement: kubernetes.default.svc.cluster.local:443
+            target_label: __address__
+          - regex: (.+)
+            replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
+            source_labels:
+            - __meta_kubernetes_node_name
+            target_label: __metrics_path__
+          scheme: https
+          tls_config:
+            ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
+            insecure_skip_verify: false
+            server_name: kubernetes
+        - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
+          job_name: integrations/kubernetes/kubelet
+          kubernetes_sd_configs:
+          - role: node
+          metric_relabel_configs:
+          - source_labels: [__name__]
+            regex: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate|container_memory_swap|kube_statefulset_status_replicas_ready|kube_node_spec_taint|storage_operation_duration_seconds_count|kubelet_pleg_relist_duration_seconds_count|cluster:namespace:pod_memory:active:kube_pod_container_resource_requests|kube_daemonset_status_number_available|kubelet_certificate_manager_server_ttl_seconds|container_network_receive_packets_total|namespace_workload_pod|kubelet_pod_worker_duration_seconds_count|kube_statefulset_status_replicas_updated|kube_pod_status_phase|volume_manager_total_volumes|kubelet_running_containers|storage_operation_errors_total|kube_statefulset_status_observed_generation|kubelet_node_config_error|container_fs_reads_bytes_total|kube_horizontalpodautoscaler_status_current_replicas|rest_client_request_duration_seconds_bucket|container_memory_cache|kube_daemonset_updated_number_scheduled|kube_job_spec_completions|kubelet_volume_stats_capacity_bytes|kube_daemonset_status_number_misscheduled|container_memory_rss|namespace_cpu:kube_pod_container_resource_limits:sum|kubelet_volume_stats_inodes_used|container_network_transmit_bytes_total|kubelet_runtime_operations_errors_total|container_fs_writes_total|kube_daemonset_status_current_number_scheduled|kube_node_status_capacity|container_network_receive_packets_dropped_total|storage_operation_duration_seconds_bucket|cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests|container_network_receive_bytes_total|kubelet_runtime_operations_total|kubelet_pod_start_duration_seconds_count|kube_horizontalpodautoscaler_status_desired_replicas|container_cpu_usage_seconds_total|up|kube_resourcequota|kubelet_cgroup_manager_duration_seconds_bucket|kube_horizontalpodautoscaler_spec_max_replicas|kubelet_server_expiration_renew_errors|container_fs_writes_bytes_total|kubelet_pod_worker_duration_seconds_bucket|machine_memory_bytes|cluster:namespace:pod_memory:active:kube_pod_container_resource_limits|kube_statefulset_status_current_revision|kube_deployment_status_observed_generation|kubelet_volume_stats_inodes|kubelet_volume_stats_available_bytes|kube_deployment_status_replicas_available|container_network_transmit_packets_total|kubelet_pleg_relist_duration_seconds_bucket|cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits|container_cpu_cfs_periods_total|process_cpu_seconds_total|kubelet_pleg_relist_interval_seconds_bucket|kube_horizontalpodautoscaler_spec_min_replicas|kube_statefulset_status_replicas|kube_deployment_spec_replicas|namespace_workload_pod:kube_pod_owner:relabel|kube_pod_owner|node_namespace_pod_container:container_memory_working_set_bytes|kubelet_node_name|kube_job_failed|container_memory_working_set_bytes|kubelet_runtime_operations_duration_seconds_bucket|namespace_memory:kube_pod_container_resource_requests:sum|kube_node_info|kubernetes_build_info|kube_statefulset_status_update_revision|kube_pod_container_resource_limits|kubelet_running_pods|kube_statefulset_replicas|kube_namespace_created|node_namespace_pod_container:container_memory_rss|node_namespace_pod_container:container_memory_cache|kube_node_status_condition|kube_pod_container_resource_requests|kubelet_running_pod_count|namespace_memory:kube_pod_container_resource_limits:sum|node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile|kube_pod_container_status_waiting_reason|kubelet_certificate_manager_client_expiration_renew_errors|kubelet_running_container_count|node_namespace_pod_container:container_memory_swap|kube_deployment_status_replicas_updated|process_resident_memory_bytes|rest_client_requests_total|kube_node_status_allocatable|kube_statefulset_metadata_generation|kube_deployment_metadata_generation|container_network_transmit_packets_dropped_total|kube_pod_info|kubelet_cgroup_manager_duration_seconds_count|container_fs_reads_total|kube_daemonset_status_desired_number_scheduled|container_cpu_cfs_throttled_periods_total|kube_job_status_succeeded|kubelet_certificate_manager_client_ttl_seconds|kube_replicaset_owner|go_goroutines|namespace_cpu:kube_pod_container_resource_requests:sum
+            action: keep
+          relabel_configs:
+          - replacement: kubernetes.default.svc.cluster.local:443
+            target_label: __address__
+          - regex: (.+)
+            replacement: /api/v1/nodes/${1}/proxy/metrics
+            source_labels:
+            - __meta_kubernetes_node_name
+            target_label: __metrics_path__
+          scheme: https
+          tls_config:
+            ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
+            insecure_skip_verify: false
+            server_name: kubernetes
+        - job_name: integrations/kubernetes/kube-state-metrics
+          kubernetes_sd_configs:
+          - role: service
+          metric_relabel_configs:
+          - source_labels: [__name__]
+            regex: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate|container_memory_swap|kube_statefulset_status_replicas_ready|kube_node_spec_taint|storage_operation_duration_seconds_count|kubelet_pleg_relist_duration_seconds_count|cluster:namespace:pod_memory:active:kube_pod_container_resource_requests|kube_daemonset_status_number_available|kubelet_certificate_manager_server_ttl_seconds|container_network_receive_packets_total|namespace_workload_pod|kubelet_pod_worker_duration_seconds_count|kube_statefulset_status_replicas_updated|kube_pod_status_phase|volume_manager_total_volumes|kubelet_running_containers|storage_operation_errors_total|kube_statefulset_status_observed_generation|kubelet_node_config_error|container_fs_reads_bytes_total|kube_horizontalpodautoscaler_status_current_replicas|rest_client_request_duration_seconds_bucket|container_memory_cache|kube_daemonset_updated_number_scheduled|kube_job_spec_completions|kubelet_volume_stats_capacity_bytes|kube_daemonset_status_number_misscheduled|container_memory_rss|namespace_cpu:kube_pod_container_resource_limits:sum|kubelet_volume_stats_inodes_used|container_network_transmit_bytes_total|kubelet_runtime_operations_errors_total|container_fs_writes_total|kube_daemonset_status_current_number_scheduled|kube_node_status_capacity|container_network_receive_packets_dropped_total|storage_operation_duration_seconds_bucket|cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests|container_network_receive_bytes_total|kubelet_runtime_operations_total|kubelet_pod_start_duration_seconds_count|kube_horizontalpodautoscaler_status_desired_replicas|container_cpu_usage_seconds_total|up|kube_resourcequota|kubelet_cgroup_manager_duration_seconds_bucket|kube_horizontalpodautoscaler_spec_max_replicas|kubelet_server_expiration_renew_errors|container_fs_writes_bytes_total|kubelet_pod_worker_duration_seconds_bucket|machine_memory_bytes|cluster:namespace:pod_memory:active:kube_pod_container_resource_limits|kube_statefulset_status_current_revision|kube_deployment_status_observed_generation|kubelet_volume_stats_inodes|kubelet_volume_stats_available_bytes|kube_deployment_status_replicas_available|container_network_transmit_packets_total|kubelet_pleg_relist_duration_seconds_bucket|cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits|container_cpu_cfs_periods_total|process_cpu_seconds_total|kubelet_pleg_relist_interval_seconds_bucket|kube_horizontalpodautoscaler_spec_min_replicas|kube_statefulset_status_replicas|kube_deployment_spec_replicas|namespace_workload_pod:kube_pod_owner:relabel|kube_pod_owner|node_namespace_pod_container:container_memory_working_set_bytes|kubelet_node_name|kube_job_failed|container_memory_working_set_bytes|kubelet_runtime_operations_duration_seconds_bucket|namespace_memory:kube_pod_container_resource_requests:sum|kube_node_info|kubernetes_build_info|kube_statefulset_status_update_revision|kube_pod_container_resource_limits|kubelet_running_pods|kube_statefulset_replicas|kube_namespace_created|node_namespace_pod_container:container_memory_rss|node_namespace_pod_container:container_memory_cache|kube_node_status_condition|kube_pod_container_resource_requests|kubelet_running_pod_count|namespace_memory:kube_pod_container_resource_limits:sum|node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile|kube_pod_container_status_waiting_reason|kubelet_certificate_manager_client_expiration_renew_errors|kubelet_running_container_count|node_namespace_pod_container:container_memory_swap|kube_deployment_status_replicas_updated|process_resident_memory_bytes|rest_client_requests_total|kube_node_status_allocatable|kube_statefulset_metadata_generation|kube_deployment_metadata_generation|container_network_transmit_packets_dropped_total|kube_pod_info|kubelet_cgroup_manager_duration_seconds_count|container_fs_reads_total|kube_daemonset_status_desired_number_scheduled|container_cpu_cfs_throttled_periods_total|kube_job_status_succeeded|kubelet_certificate_manager_client_ttl_seconds|kube_replicaset_owner|go_goroutines|namespace_cpu:kube_pod_container_resource_requests:sum
+            action: keep
+          relabel_configs:
+          - action: keep
+            regex: ksm-kube-state-metrics
+            source_labels:
+            - __meta_kubernetes_service_name
+        - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
+          job_name: 'integrations/node_exporter'
+          kubernetes_sd_configs:
+          - role: endpoints
+          relabel_configs:
+          - source_labels: [__meta_kubernetes_endpoints_name]
+            regex: 'node-exporter'
+            action: keep
+          # relabel 'instance'
+          - source_labels: [__meta_kubernetes_pod_node_name]
+            action: replace
+            target_label: instance
+
+    integrations:
+      prometheus_remote_write:
+      - url: https://prometheus-prod-01-eu-west-0.grafana.net/api/prom/push
+        basic_auth:
+          username: $USERNAME
+          password: $APIKEY
+
+    # logs example
+    # logs:
+    #   configs:
+    #   - name: integrations
+    #     clients:
+    #     - url: https://logs-prod-eu-west-0.grafana.net/api/prom/push
+    #       basic_auth:
+    #         username: 195593
+    #         password: $APIKEY
+    #       external_labels:
+    #         cluster: cloud
+    #     positions:
+    #       filename: /tmp/positions.yaml
+    #     target_config:
+    #       sync_period: 10s
@@ -0,0 +1,83 @@
+# https://raw.githubusercontent.com/grafana/agent/v0.23.0/production/kubernetes/agent-bare.yaml
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: grafana-agent
+  namespace: monitoring
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: grafana-agent
+rules:
+- apiGroups:
+  - ""
+  resources:
+  - nodes
+  - nodes/proxy
+  - services
+  - endpoints
+  - pods
+  verbs:
+  - get
+  - list
+  - watch
+- nonResourceURLs:
+  - /metrics
+  verbs:
+  - get
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: grafana-agent
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: grafana-agent
+subjects:
+- kind: ServiceAccount
+  name: grafana-agent
+  namespace: monitoring
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: grafana-agent
+  namespace: monitoring
+spec:
+  minReadySeconds: 10
+  replicas: 1
+  revisionHistoryLimit: 10
+  selector:
+    matchLabels:
+      name: grafana-agent
+  template:
+    metadata:
+      labels:
+        name: grafana-agent
+    spec:
+      containers:
+      - args:
+        - -config.file=/etc/agent/agent.yaml
+        command:
+        - /bin/agent
+        env:
+        - name: HOSTNAME
+          valueFrom:
+            fieldRef:
+              fieldPath: spec.nodeName
+        image: grafana/agent:v0.23.0
+        imagePullPolicy: IfNotPresent
+        name: agent
+        ports:
+        - containerPort: 12345
+          name: http-metrics
+        volumeMounts:
+        - mountPath: /etc/agent
+          name: grafana-agent
+      serviceAccount: grafana-agent
+      volumes:
+      - configMap:
+          name: grafana-agent
+        name: grafana-agent
@@ -0,0 +1,96 @@
+# see https://devopscube.com/node-exporter-kubernetes/
+# and https://www.opsramp.com/prometheus-monitoring/prometheus-node-exporter/
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  labels:
+    app.kubernetes.io/component: exporter
+    app.kubernetes.io/name: node-exporter
+  name: node-exporter
+  namespace: monitoring
+spec:
+  selector:
+    matchLabels:
+      app.kubernetes.io/component: exporter
+      app.kubernetes.io/name: node-exporter
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/component: exporter
+        app.kubernetes.io/name: node-exporter
+    spec:
+      containers:
+      - args:
+        - --path.sysfs=/host/sys
+        - --path.rootfs=/host/root
+        - --no-collector.wifi
+        - --no-collector.hwmon
+        - --no-collector.infiniband
+        - --no-collector.filefd
+        - --no-collector.ipvs
+        - --no-collector.mdadm
+        - --no-collector.netclass
+        - --no-collector.netstat
+        - --no-collector.nfsd
+        - --no-collector.nvme
+        - --no-collector.powersupplyclass
+        - --no-collector.pressure
+        - --no-collector.rapl
+        - --no-collector.schedstat
+        - --no-collector.sockstat
+        - --no-collector.softnet
+        - --no-collector.tapestats
+        - --no-collector.thermal_zone
+        - --no-collector.udp_queues
+        - --no-collector.xfs
+        - --no-collector.zfs
+        - --collector.filesystem.ignored-mount-points=^/(dev|proc|sys|var/lib/docker/.+|var/lib/kubelet/pods/.+)($|/)
+        - --collector.netclass.ignored-devices=^(veth.*)$
+        name: node-exporter
+        image: prom/node-exporter
+        ports:
+        - containerPort: 9100
+          protocol: TCP
+        resources:
+          limits:
+            cpu: 500m
+            memory: 180Mi
+          requests:
+            cpu: 102m
+            memory: 180Mi
+        volumeMounts:
+        - mountPath: /host/sys
+          mountPropagation: HostToContainer
+          name: sys
+          readOnly: true
+        - mountPath: /host/root
+          mountPropagation: HostToContainer
+          name: root
+          readOnly: true
+      volumes:
+      - hostPath:
+          path: /sys
+        name: sys
+      - hostPath:
+          path: /
+        name: root
+---
+kind: Service
+apiVersion: v1
+metadata:
+  name: node-exporter
+  namespace: monitoring
+  labels:
+    instance: primary
+  annotations:
+    prometheus.io/scrape: 'true'
+    prometheus.io/port: '9100'
+spec:
+  selector:
+    app.kubernetes.io/component: exporter
+    app.kubernetes.io/name: node-exporter
+  ports:
+  - name: node-exporter
+    protocol: TCP
+    port: 9100
+    targetPort: 9100
@@ -0,0 +1,22 @@
+apiVersion: monitoring.coreos.com/v1
+kind: Prometheus
+metadata:
+  name: prometheus
+  namespace: monitoring
+  labels:
+    app: prometheus
+spec:
+  image: quay.io/prometheus/prometheus:v2.22.1
+  nodeSelector:
+    kubernetes.io/os: linux
+  replicas: 2
+  resources:
+    requests:
+      memory: 400Mi
+  securityContext:
+    fsGroup: 2000
+    runAsNonRoot: true
+    runAsUser: 1000
+  serviceAccountName: prometheus
+  version: v2.22.1
+  serviceMonitorSelector: {}
@@ -0,0 +1,42 @@
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: prometheus
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: prometheus
+rules:
+- apiGroups: [""]
+  resources:
+  - nodes
+  - nodes/metrics
+  - services
+  - endpoints
+  - pods
+  verbs: ["get", "list", "watch"]
+- apiGroups: [""]
+  resources:
+  - configmaps
+  verbs: ["get"]
+- apiGroups:
+  - networking.k8s.io
+  resources:
+  - ingresses
+  verbs: ["get", "list", "watch"]
+- nonResourceURLs: ["/metrics"]
+  verbs: ["get"]
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: prometheus
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: prometheus
+subjects:
+- kind: ServiceAccount
+  name: prometheus
+  namespace: monitoring
@@ -0,0 +1,15 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: prometheus
+  namespace: monitoring
+  labels:
+    app: prometheus
+spec:
+  ports:
+  - name: web
+    port: 9090
+    targetPort: web
+  selector:
+    app: prometheus
+  sessionAffinity: ClientIP
@@ -0,0 +1,14 @@
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  name: prometheus-self
+  namespace: monitoring
+  labels:
+    app: prometheus
+spec:
+  endpoints:
+  - interval: 30s
+    port: web
+  selector:
+    matchLabels:
+      app: prometheus