From 8a3fc9399ec2bf4590e3818fd85f7f3a7c964155 Mon Sep 17 00:00:00 2001 From: Pat Dyn Date: Fri, 23 Dec 2022 14:52:43 +0000 Subject: [PATCH] We now produce a working monitoring config --- doc/Architecture.md | 6 +- doc/Monitoring.md | 23 +++ project.clj | 16 ++- src/main/clj/dda/c4k_common/uberjar.clj | 14 +- src/main/cljc/dda/c4k_common/common.cljc | 6 +- src/main/cljc/dda/c4k_common/core.cljc | 21 +++ src/main/cljc/dda/c4k_common/monitoring.cljc | 131 ++++++++++++++++++ src/main/cljc/dda/c4k_common/predicate.cljc | 4 + .../cluster-role-binding.yaml | 17 +++ .../kube-state-metrics/cluster-role.yaml | 128 +++++++++++++++++ .../kube-state-metrics/deployment.yaml | 53 +++++++ .../kube-state-metrics/service-account.yaml | 10 ++ .../kube-state-metrics/service.yaml | 20 +++ src/main/resources/monitoring/namespace.yaml | 6 + .../node-exporter/cluster-role-binding.yaml | 14 ++ .../node-exporter/cluster-role.yaml | 45 ++++++ .../monitoring/node-exporter/daemon-set.yaml | 71 ++++++++++ .../node-exporter/service-account.yaml | 7 + .../monitoring/node-exporter/service.yaml | 18 +++ .../prometheus/cluster-role-binding.yaml | 14 ++ .../monitoring/prometheus/cluster-role.yaml | 37 +++++ .../monitoring/prometheus/config.yaml | 8 ++ .../monitoring/prometheus/deployment.yaml | 42 ++++++ .../monitoring/prometheus/prometheus.yaml | 64 +++++++++ .../prometheus/service-account.yaml | 8 ++ .../monitoring/prometheus/service.yaml | 18 +++ .../clj/dda/c4k_common/common_spec_test.cljc | 16 +++ .../dda/c4k_common/monitoring_regex_test.clj | 24 ++++ src/test/cljc/dda/c4k_common/common_test.cljc | 2 +- .../cljc/dda/c4k_common/monitoring_test.cljc | 47 +++++++ .../should_filter_metrik.edn | 19 +++ src/test/resources/metrics_list.txt | 30 ++++ 32 files changed, 928 insertions(+), 11 deletions(-) create mode 100644 doc/Monitoring.md create mode 100644 src/main/cljc/dda/c4k_common/core.cljc create mode 100644 src/main/cljc/dda/c4k_common/monitoring.cljc create mode 100644 
src/main/resources/monitoring/kube-state-metrics/cluster-role-binding.yaml create mode 100644 src/main/resources/monitoring/kube-state-metrics/cluster-role.yaml create mode 100644 src/main/resources/monitoring/kube-state-metrics/deployment.yaml create mode 100644 src/main/resources/monitoring/kube-state-metrics/service-account.yaml create mode 100644 src/main/resources/monitoring/kube-state-metrics/service.yaml create mode 100644 src/main/resources/monitoring/namespace.yaml create mode 100644 src/main/resources/monitoring/node-exporter/cluster-role-binding.yaml create mode 100644 src/main/resources/monitoring/node-exporter/cluster-role.yaml create mode 100644 src/main/resources/monitoring/node-exporter/daemon-set.yaml create mode 100644 src/main/resources/monitoring/node-exporter/service-account.yaml create mode 100644 src/main/resources/monitoring/node-exporter/service.yaml create mode 100644 src/main/resources/monitoring/prometheus/cluster-role-binding.yaml create mode 100644 src/main/resources/monitoring/prometheus/cluster-role.yaml create mode 100644 src/main/resources/monitoring/prometheus/config.yaml create mode 100644 src/main/resources/monitoring/prometheus/deployment.yaml create mode 100644 src/main/resources/monitoring/prometheus/prometheus.yaml create mode 100644 src/main/resources/monitoring/prometheus/service-account.yaml create mode 100644 src/main/resources/monitoring/prometheus/service.yaml create mode 100644 src/test/clj/dda/c4k_common/common_spec_test.cljc create mode 100644 src/test/clj/dda/c4k_common/monitoring_regex_test.clj create mode 100644 src/test/cljc/dda/c4k_common/monitoring_test.cljc create mode 100644 src/test/resources/dda/c4k_common/monitoring_regex_test/should_filter_metrik.edn create mode 100644 src/test/resources/metrics_list.txt diff --git a/doc/Architecture.md b/doc/Architecture.md index 27ce619..cb8d505 100644 --- a/doc/Architecture.md +++ b/doc/Architecture.md @@ -14,7 +14,7 @@ C4Context Container_Boundary(k3s, "K3S") { 
Component(lb, "metallb") Component(api, "K8s API") - Component(grafana-agent, "Grfana Agent") + Component(prometheus, "Prometheus in proxy mode") Container_Boundary(app, "Application") { Component(app, "App-container") Component(app-backup, "backup & restore-container using restic") @@ -41,8 +41,8 @@ C4Context Rel(app, app-file-storage, "file") Rel(app, app-db-storage, "*dbc") - Rel(grafana-agent, api, "http") - Rel(grafana-agent, grafana, "http") + Rel(prometheus, api, "http") + Rel(prometheus, grafana, "http") Rel(app-backup, backup, "s3") Rel(app-backup, app-file-storage, "file") diff --git a/doc/Monitoring.md b/doc/Monitoring.md new file mode 100644 index 0000000..8fdb105 --- /dev/null +++ b/doc/Monitoring.md @@ -0,0 +1,23 @@ +# Runtime View + +```mermaid +C4Context + title Runtime + Enterprise_Boundary(b0, "Infrastructure") { + System(grafana, "Grafana Cloud", "Monitoring your apps") + + Container_Boundary(srv, "Small Server") { + Container_Boundary(k3s, "K3S") { + Component(api, "K8s API") + Container(prometheus, "Prometheus in proxy mode") + Container(node-exporter, "Node-Exporter Daemon Set") + Container_Boundary(app, "Application") { + Container(app, "App-container") + } + } + } + } + + Rel(prometheus, api, "rest") + Rel(prometheus, grafana, "rest") +``` \ No newline at end of file diff --git a/project.clj b/project.clj index dfe2c4b..b621c35 100644 --- a/project.clj +++ b/project.clj @@ -9,6 +9,7 @@ [orchestra "2021.01.01-1"] [expound "0.9.0"] [clj-commons/clj-yaml "0.7.108"]] + :target-path "target/%s/" :source-paths ["src/main/cljc" "src/main/clj"] :resource-paths ["src/main/resources"] @@ -20,10 +21,21 @@ "src/test/cljc"] :resource-paths ["src/test/resources"] :dependencies [[dda/data-test "0.1.1"]]} - :dev {:plugins [[lein-shell "0.5.0"]]}} + :dev {:plugins [[lein-shell "0.5.0"]]} + :uberjar {:aot :all + :main dda.c4k-common.uberjar + :uberjar-name "c4k-common-standalone.jar" + :dependencies [[org.clojure/tools.cli "1.0.206"] + 
[ch.qos.logback/logback-classic "1.3.0-alpha4" + :exclusions [com.sun.mail/javax.mail]] + [org.slf4j/jcl-over-slf4j "2.0.0-alpha1"]]}} :release-tasks [["test"] ["vcs" "assert-committed"] ["change" "version" "leiningen.release/bump-version" "release"] ["vcs" "commit"] ["vcs" "tag" "v" "--no-sign"] - ["change" "version" "leiningen.release/bump-version"]]) + ["change" "version" "leiningen.release/bump-version"]] + :aliases {"inst" ["shell" + "sh" + "-c" + "lein uberjar && sudo install -m=755 target/uberjar/c4k-common-standalone.jar /usr/local/bin/c4k-common-standalone.jar"]}) diff --git a/src/main/clj/dda/c4k_common/uberjar.clj b/src/main/clj/dda/c4k_common/uberjar.clj index 499c50f..44ec7b7 100644 --- a/src/main/clj/dda/c4k_common/uberjar.clj +++ b/src/main/clj/dda/c4k_common/uberjar.clj @@ -1,16 +1,18 @@ (ns dda.c4k-common.uberjar + (:gen-class) (:require [clojure.spec.alpha :as s] [clojure.string :as cs] [clojure.tools.reader.edn :as edn] [dda.c4k-common.common :as cm] + [dda.c4k-common.core :as core] [expound.alpha :as expound])) (defn usage [name] (str "usage: - " name "{your configuraton file} {your authorization file}")) + " name " {your configuraton file} {your authorization file}")) (s/def ::options (s/* #{"-h"})) (s/def ::filename (s/and string? @@ -49,4 +51,12 @@ (expound/expound-str config-spec? config-edn {:print-specs? false}))) (when (not auth-valid?) (println - (expound/expound-str auth-spec? auth-edn {:print-specs? false}))))))))))) \ No newline at end of file + (expound/expound-str auth-spec? auth-edn {:print-specs? false}))))))))))) + +(defn -main [& cmd-args] + (main-common "c4k-common" + core/config? + core/auth? 
+ core/config-defaults + core/k8s-objects + cmd-args)) \ No newline at end of file diff --git a/src/main/cljc/dda/c4k_common/common.cljc b/src/main/cljc/dda/c4k_common/common.cljc index 8bb53d9..4bf94ff 100644 --- a/src/main/cljc/dda/c4k_common/common.cljc +++ b/src/main/cljc/dda/c4k_common/common.cljc @@ -44,7 +44,7 @@ coll)) (defn-spec replace-all-matching-values-by-new-value cp/map-or-seq? - [coll string? + [coll cp/map-or-seq? value-to-match string? value-to-replace string?] (clojure.walk/postwalk #(if (and (= (type value-to-match) (type %)) @@ -59,7 +59,7 @@ (apply concat vs))) (defn generate-common [my-config my-auth config-defaults k8s-objects] - (let [resulting-config (merge config-defaults my-config my-auth)] + (let [resulting-config (merge config-defaults my-config)] (cs/join "\n---\n" - (k8s-objects resulting-config)))) \ No newline at end of file + (k8s-objects resulting-config my-auth)))) diff --git a/src/main/cljc/dda/c4k_common/core.cljc b/src/main/cljc/dda/c4k_common/core.cljc new file mode 100644 index 0000000..98e78eb --- /dev/null +++ b/src/main/cljc/dda/c4k_common/core.cljc @@ -0,0 +1,21 @@ +(ns dda.c4k-common.core + (:require + [clojure.spec.alpha :as s] + [dda.c4k-common.yaml :as yaml] + [dda.c4k-common.common :as cm] + [dda.c4k-common.monitoring :as monitoring])) + +(def config-defaults {}) + +(def config? (s/keys :req-un [] + :opt-un [])) + +(def auth? (s/keys :req-un [] + :opt-un [])) + +(defn k8s-objects [config auth] + (let [] + (map yaml/to-string + (filter #(not (nil? 
%)) + (cm/concat-vec + (monitoring/generate config auth)))))) \ No newline at end of file diff --git a/src/main/cljc/dda/c4k_common/monitoring.cljc b/src/main/cljc/dda/c4k_common/monitoring.cljc new file mode 100644 index 0000000..dbffb3d --- /dev/null +++ b/src/main/cljc/dda/c4k_common/monitoring.cljc @@ -0,0 +1,131 @@ +(ns dda.c4k-common.monitoring + (:require + [clojure.spec.alpha :as s] + #?(:cljs [shadow.resource :as rc]) + #?(:clj [orchestra.core :refer [defn-spec]] + :cljs [orchestra.core :refer-macros [defn-spec]]) + [dda.c4k-common.yaml :as yaml] + [dda.c4k-common.predicate :as cp] + [dda.c4k-common.common :as cm] + [clojure.string :as str])) + +(s/def ::grafana-cloud-user cp/bash-env-string?) +(s/def ::grafana-cloud-password cp/bash-env-string?) +(s/def ::grafana-cloud-url string?) ; written verbatim to remote_write[0].url; NOTE(review): cp/fqdn-string? presumably rejects scheme and path - confirm +(s/def ::k3s-cluster-name cp/bash-env-string?) +(s/def ::k3s-cluster-stage cp/stage?) +(s/def ::pvc-storage-class-name cp/pvc-storage-class-name?) +(s/def ::node-regex string?) +(s/def ::traefik-regex string?) +(s/def ::kube-state-regex string?) + +;; TODO: rename to monitoring. Each predicate below returns a boolean via s/valid?; a bare (s/keys ...) is a spec object and therefore always truthy. +(defn grafana-config? [input] + (s/valid? (s/keys :req-un [::grafana-cloud-url ::k3s-cluster-name ::k3s-cluster-stage]) input)) + +(defn grafana-auth? [input] + (s/valid? (s/keys :req-un [::grafana-cloud-user ::grafana-cloud-password]) input)) + +(defn grafana-provider? [input] + (s/valid? (s/keys :opt-un [::pvc-storage-class-name]) input)) + +(defn filter-regex? 
[input] + (s/valid? (s/keys :req-un [::node-regex ::traefik-regex ::kube-state-regex]) input)) ; boolean result; a bare s/keys spec is always truthy + +(def metric-regex {:node-regex + (str "node_cpu_sec.+|node_load[0-9]+|node_memory_Buf.*|node_memory_Mem.*|" + "node_memory_Cached.*|node_disk_[rwi].*|node_filesystem_[sa].*|" ; character classes take no separators, a comma would be matched literally + "node_network_receive_bytes_total|node_network_transmit_bytes_total") + :traefik-regex (str "traefik_entrypoint_.*_total|" + "traefik_entrypoint_.*_seconds_count|" + "traefik_router_.*_total|" + "traefik_router_.*_seconds_count|" + "traefik_service_.*_total|" + "traefik_service_.*_seconds_count|" + "traefik_tls_certs_not_after") + :kube-state-regex (str "kube_pod_container_status_restarts_total|" + "kube_pod_status_reason|kube_node_status_capacity|kube_node_status_allocatable|" + "kube_cronjob_status_active|kube_job_status_failed")}) + +(def filter-regex-string + (str/join "|" (vals metric-regex))) + +#?(:cljs + (defmethod yaml/load-resource :monitoring [resource-name] + (case resource-name + "monitoring/namespace.yaml" (rc/inline "monitoring/namespace.yaml") + "monitoring/prometheus/config.yaml" (rc/inline "monitoring/prometheus/config.yaml") + "monitoring/prometheus/cluster-role.yaml" (rc/inline "monitoring/prometheus/cluster-role.yaml") + "monitoring/prometheus/cluster-role-binding.yaml" (rc/inline "monitoring/prometheus/cluster-role-binding.yaml") + "monitoring/prometheus/deployment.yaml" (rc/inline "monitoring/prometheus/deployment.yaml") + "monitoring/prometheus/prometheus.yaml" (rc/inline "monitoring/prometheus/prometheus.yaml") + "monitoring/prometheus/service.yaml" (rc/inline "monitoring/prometheus/service.yaml") + "monitoring/prometheus/service-account.yaml" (rc/inline "monitoring/prometheus/service-account.yaml") + "monitoring/node-exporter/daemon-set.yaml" (rc/inline "monitoring/node-exporter/daemon-set.yaml") + "monitoring/node-exporter/service.yaml" (rc/inline "monitoring/node-exporter/service.yaml") + "monitoring/node-exporter/cluster-role-binding.yaml" (rc/inline 
"monitoring/node-exporter/cluster-role-binding.yaml") + "monitoring/node-exporter/cluster-role.yaml" (rc/inline "monitoring/node-exporter/cluster-role.yaml") + "monitoring/node-exporter/service-account.yaml" (rc/inline "monitoring/node-exporter/service-account.yaml") + "monitoring/kube-state-metrics/cluster-role-binding.yaml" (rc/inline "monitoring/kube-state-metrics/cluster-role-binding.yaml") + "monitoring/kube-state-metrics/cluster-role.yaml" (rc/inline "monitoring/kube-state-metrics/cluster-role.yaml") + "monitoring/kube-state-metrics/deployment.yaml" (rc/inline "monitoring/kube-state-metrics/deployment.yaml") + "monitoring/kube-state-metrics/service-account.yaml" (rc/inline "monitoring/kube-state-metrics/service-account.yaml") + "monitoring/kube-state-metrics/service.yaml" (rc/inline "monitoring/kube-state-metrics/service.yaml") + (throw (js/Error. "Undefined Resource!"))))) + +(defn-spec generate-stateful-set cp/map-or-seq? + [config grafana-provider?] + (let [{:keys [pvc-storage-class-name] + :or {pvc-storage-class-name :manual}} config] + (-> + (yaml/from-string (yaml/load-resource "monitoring/stateful-set.yaml")) + (assoc-in [:spec :volumeClaimTemplates 0 :spec :storageClassName] (name pvc-storage-class-name))))) + +(defn-spec generate-prometheus-config cp/map-or-seq? + [config grafana-config? + auth grafana-auth?] 
+ (let [{:keys [grafana-cloud-url k3s-cluster-name k3s-cluster-stage]} config + {:keys [grafana-cloud-user grafana-cloud-password]} auth] + (-> + (yaml/from-string (yaml/load-resource "monitoring/prometheus/prometheus.yaml")) + (assoc-in [:global :external_labels :cluster] + k3s-cluster-name) + (assoc-in [:global :external_labels :stage] + k3s-cluster-stage) + (assoc-in [:remote_write 0 :url] + grafana-cloud-url) + (assoc-in [:remote_write 0 :basic_auth :username] + grafana-cloud-user) + (assoc-in [:remote_write 0 :basic_auth :password] + grafana-cloud-password) + (cm/replace-all-matching-values-by-new-value "FILTER_REGEX" filter-regex-string)))) + +(defn-spec generate-config cp/map-or-seq? + [config grafana-config? + auth grafana-auth?] + (-> + (yaml/from-string (yaml/load-resource "monitoring/prometheus/config.yaml")) + (assoc-in [:stringData :prometheus.yaml] + (yaml/to-string + (generate-prometheus-config config auth))))) + +(defn-spec generate vector? ; the body is a vector literal of k8s objects; a vector is neither map? nor seq?, so cp/map-or-seq? could never hold as :ret + [config grafana-config? + auth grafana-auth?] 
+ [(yaml/from-string (yaml/load-resource "monitoring/namespace.yaml")) + (yaml/from-string (yaml/load-resource "monitoring/prometheus/cluster-role.yaml")) + (yaml/from-string (yaml/load-resource "monitoring/prometheus/cluster-role-binding.yaml")) + (yaml/from-string (yaml/load-resource "monitoring/prometheus/service.yaml")) + (yaml/from-string (yaml/load-resource "monitoring/prometheus/service-account.yaml")) + (generate-config config auth) + (yaml/from-string (yaml/load-resource "monitoring/prometheus/deployment.yaml")) + (yaml/from-string (yaml/load-resource "monitoring/node-exporter/service-account.yaml")) + (yaml/from-string (yaml/load-resource "monitoring/node-exporter/cluster-role.yaml")) + (yaml/from-string (yaml/load-resource "monitoring/node-exporter/cluster-role-binding.yaml")) + (yaml/from-string (yaml/load-resource "monitoring/node-exporter/daemon-set.yaml")) + (yaml/from-string (yaml/load-resource "monitoring/node-exporter/service.yaml")) + (yaml/from-string (yaml/load-resource "monitoring/kube-state-metrics/cluster-role-binding.yaml")) + (yaml/from-string (yaml/load-resource "monitoring/kube-state-metrics/cluster-role.yaml")) + (yaml/from-string (yaml/load-resource "monitoring/kube-state-metrics/deployment.yaml")) + (yaml/from-string (yaml/load-resource "monitoring/kube-state-metrics/service-account.yaml")) + (yaml/from-string (yaml/load-resource "monitoring/kube-state-metrics/service.yaml"))]) diff --git a/src/main/cljc/dda/c4k_common/predicate.cljc b/src/main/cljc/dda/c4k_common/predicate.cljc index da55f18..9b955ad 100644 --- a/src/main/cljc/dda/c4k_common/predicate.cljc +++ b/src/main/cljc/dda/c4k_common/predicate.cljc @@ -22,6 +22,10 @@ [input] (contains? #{"prod" "staging"} input)) +(defn stage? + [input] + (contains? #{:prod :acc :int :test :dev} input)) + (defn map-or-seq? [input] (or (map? input) (seq? 
input))) diff --git a/src/main/resources/monitoring/kube-state-metrics/cluster-role-binding.yaml b/src/main/resources/monitoring/kube-state-metrics/cluster-role-binding.yaml new file mode 100644 index 0000000..0fc2f12 --- /dev/null +++ b/src/main/resources/monitoring/kube-state-metrics/cluster-role-binding.yaml @@ -0,0 +1,17 @@ +# from https://github.com/kubernetes/kube-state-metrics/tree/main/examples/standard +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/version: 2.7.0 + name: kube-state-metrics +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: kube-state-metrics +subjects: +- kind: ServiceAccount + name: kube-state-metrics + namespace: monitoring diff --git a/src/main/resources/monitoring/kube-state-metrics/cluster-role.yaml b/src/main/resources/monitoring/kube-state-metrics/cluster-role.yaml new file mode 100644 index 0000000..e20db08 --- /dev/null +++ b/src/main/resources/monitoring/kube-state-metrics/cluster-role.yaml @@ -0,0 +1,128 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/version: 2.7.0 + name: kube-state-metrics +rules: +- apiGroups: + - "" + resources: + - configmaps + - secrets + - nodes + - pods + - services + - serviceaccounts + - resourcequotas + - replicationcontrollers + - limitranges + - persistentvolumeclaims + - persistentvolumes + - namespaces + - endpoints + verbs: + - list + - watch +- apiGroups: + - apps + resources: + - statefulsets + - daemonsets + - deployments + - replicasets + verbs: + - list + - watch +- apiGroups: + - batch + resources: + - cronjobs + - jobs + verbs: + - list + - watch +- apiGroups: + - autoscaling + resources: + - horizontalpodautoscalers + verbs: + - list + - watch +- apiGroups: + - 
authentication.k8s.io + resources: + - tokenreviews + verbs: + - create +- apiGroups: + - authorization.k8s.io + resources: + - subjectaccessreviews + verbs: + - create +- apiGroups: + - policy + resources: + - poddisruptionbudgets + verbs: + - list + - watch +- apiGroups: + - certificates.k8s.io + resources: + - certificatesigningrequests + verbs: + - list + - watch +- apiGroups: + - discovery.k8s.io + resources: + - endpointslices + verbs: + - list + - watch +- apiGroups: + - storage.k8s.io + resources: + - storageclasses + - volumeattachments + verbs: + - list + - watch +- apiGroups: + - admissionregistration.k8s.io + resources: + - mutatingwebhookconfigurations + - validatingwebhookconfigurations + verbs: + - list + - watch +- apiGroups: + - networking.k8s.io + resources: + - networkpolicies + - ingressclasses + - ingresses + verbs: + - list + - watch +- apiGroups: + - coordination.k8s.io + resources: + - leases + verbs: + - list + - watch +- apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterrolebindings + - clusterroles + - rolebindings + - roles + verbs: + - list + - watch diff --git a/src/main/resources/monitoring/kube-state-metrics/deployment.yaml b/src/main/resources/monitoring/kube-state-metrics/deployment.yaml new file mode 100644 index 0000000..6c1b543 --- /dev/null +++ b/src/main/resources/monitoring/kube-state-metrics/deployment.yaml @@ -0,0 +1,53 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/version: 2.7.0 + name: kube-state-metrics + namespace: monitoring +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: kube-state-metrics + template: + metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/version: 2.7.0 + spec: + serviceAccountName: kube-state-metrics + automountServiceAccountToken: true + containers: + - name: 
kube-state-metrics + image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.7.0 + livenessProbe: + httpGet: + path: /healthz + port: 8080 + initialDelaySeconds: 5 + timeoutSeconds: 5 + ports: + - containerPort: 8080 + name: http-metrics + - containerPort: 8081 + name: telemetry + readinessProbe: + httpGet: + path: / + port: 8081 + initialDelaySeconds: 5 + timeoutSeconds: 5 + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true + runAsUser: 65534 + nodeSelector: + kubernetes.io/os: linux + diff --git a/src/main/resources/monitoring/kube-state-metrics/service-account.yaml b/src/main/resources/monitoring/kube-state-metrics/service-account.yaml new file mode 100644 index 0000000..2a70683 --- /dev/null +++ b/src/main/resources/monitoring/kube-state-metrics/service-account.yaml @@ -0,0 +1,10 @@ +apiVersion: v1 +automountServiceAccountToken: false +kind: ServiceAccount +metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/version: 2.7.0 + name: kube-state-metrics + namespace: monitoring diff --git a/src/main/resources/monitoring/kube-state-metrics/service.yaml b/src/main/resources/monitoring/kube-state-metrics/service.yaml new file mode 100644 index 0000000..f71ce26 --- /dev/null +++ b/src/main/resources/monitoring/kube-state-metrics/service.yaml @@ -0,0 +1,20 @@ +apiVersion: v1 +kind: Service +metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/version: 2.7.0 + name: kube-state-metrics + namespace: monitoring +spec: + clusterIP: None + ports: + - name: http-metrics + port: 8080 + targetPort: http-metrics + - name: telemetry + port: 8081 + targetPort: telemetry + selector: + app.kubernetes.io/name: kube-state-metrics diff --git a/src/main/resources/monitoring/namespace.yaml b/src/main/resources/monitoring/namespace.yaml new file mode 100644 index 
0000000..add5391 --- /dev/null +++ b/src/main/resources/monitoring/namespace.yaml @@ -0,0 +1,6 @@ +kind: Namespace +apiVersion: v1 +metadata: + name: monitoring + labels: + name: monitoring \ No newline at end of file diff --git a/src/main/resources/monitoring/node-exporter/cluster-role-binding.yaml b/src/main/resources/monitoring/node-exporter/cluster-role-binding.yaml new file mode 100644 index 0000000..d4ac74a --- /dev/null +++ b/src/main/resources/monitoring/node-exporter/cluster-role-binding.yaml @@ -0,0 +1,14 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + labels: + name: node-exporter + name: node-exporter +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: node-exporter +subjects: +- kind: ServiceAccount + name: node-exporter + namespace: monitoring \ No newline at end of file diff --git a/src/main/resources/monitoring/node-exporter/cluster-role.yaml b/src/main/resources/monitoring/node-exporter/cluster-role.yaml new file mode 100644 index 0000000..42b553e --- /dev/null +++ b/src/main/resources/monitoring/node-exporter/cluster-role.yaml @@ -0,0 +1,45 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + name: node-exporter + name: node-exporter +rules: +- apiGroups: + - "" + resources: + - nodes + - nodes/proxy + - nodes/metrics + - services + - endpoints + - pods + - ingresses + - configmaps + - events + verbs: + - get + - list + - watch +- apiGroups: + - "extensions" + - "networking.k8s.io" + resources: + - ingresses/status + - ingresses + verbs: + - get + - list + - watch +- apiGroups: + - extensions + resources: + - podsecuritypolicies + verbs: + - use + resourceNames: + - node-exporter +- nonResourceURLs: + - /metrics + verbs: + - get diff --git a/src/main/resources/monitoring/node-exporter/daemon-set.yaml b/src/main/resources/monitoring/node-exporter/daemon-set.yaml new file mode 100644 index 0000000..3fdd9fe --- /dev/null +++ 
b/src/main/resources/monitoring/node-exporter/daemon-set.yaml @@ -0,0 +1,71 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: + labels: + app.kubernetes.io/name: node-exporter + name: node-exporter + namespace: monitoring +spec: + selector: + matchLabels: + app.kubernetes.io/name: node-exporter + template: + metadata: + labels: + app.kubernetes.io/name: node-exporter + spec: + serviceAccountName: node-exporter + containers: + - name: node-exporter + image: prom/node-exporter:v1.5.0 # pinned like the prometheus and kube-state-metrics images; an untagged image with IfNotPresent can differ per node + imagePullPolicy: "IfNotPresent" + args: + - --path.sysfs=/host/sys + - --path.rootfs=/host/root + - --no-collector.conntrack + - --no-collector.wifi + - --no-collector.hwmon + - --no-collector.infiniband + - --no-collector.filefd + - --no-collector.mdadm + - --no-collector.netclass + - --no-collector.nfs + - --no-collector.nfsd + - --no-collector.powersupplyclass + - --no-collector.pressure + - --no-collector.rapl + - --no-collector.schedstat + - --no-collector.sockstat + - --no-collector.softnet + - --no-collector.tapestats + - --no-collector.thermal_zone + - --no-collector.xfs + - --no-collector.zfs + - --collector.filesystem.ignored-mount-points=^/(dev|proc|sys|var/lib/docker/.+|var/lib/kubelet/pods/.+)($|/) + - --collector.netclass.ignored-devices=^(veth.*)$ + ports: + - containerPort: 9100 + protocol: TCP + resources: + limits: + cpu: 250m + memory: 180Mi + requests: + cpu: 102m + memory: 180Mi + volumeMounts: + - mountPath: /host/sys + mountPropagation: HostToContainer + name: sys + readOnly: true + - mountPath: /host/root + mountPropagation: HostToContainer + name: root + readOnly: true + volumes: + - hostPath: + path: /sys + name: sys + - hostPath: + path: / + name: root diff --git a/src/main/resources/monitoring/node-exporter/service-account.yaml b/src/main/resources/monitoring/node-exporter/service-account.yaml new file mode 100644 index 0000000..21127eb --- /dev/null +++ b/src/main/resources/monitoring/node-exporter/service-account.yaml @@ -0,0 +1,7 @@ +apiVersion: v1 +kind: 
ServiceAccount +metadata: + labels: + name: node-exporter + name: node-exporter + namespace: monitoring diff --git a/src/main/resources/monitoring/node-exporter/service.yaml b/src/main/resources/monitoring/node-exporter/service.yaml new file mode 100644 index 0000000..003ba95 --- /dev/null +++ b/src/main/resources/monitoring/node-exporter/service.yaml @@ -0,0 +1,18 @@ +kind: Service +apiVersion: v1 +metadata: + name: node-exporter + namespace: monitoring + labels: + app.kubernetes.io/name: node-exporter + annotations: + prometheus.io/scrape: 'true' + prometheus.io/port: '9100' +spec: + selector: + app.kubernetes.io/name: node-exporter + ports: + - name: node-exporter-http + protocol: TCP + port: 9100 + targetPort: 9100 \ No newline at end of file diff --git a/src/main/resources/monitoring/prometheus/cluster-role-binding.yaml b/src/main/resources/monitoring/prometheus/cluster-role-binding.yaml new file mode 100644 index 0000000..05779d9 --- /dev/null +++ b/src/main/resources/monitoring/prometheus/cluster-role-binding.yaml @@ -0,0 +1,14 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + labels: + name: prometheus + name: prometheus +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: prometheus +subjects: +- kind: ServiceAccount + name: prometheus + namespace: monitoring \ No newline at end of file diff --git a/src/main/resources/monitoring/prometheus/cluster-role.yaml b/src/main/resources/monitoring/prometheus/cluster-role.yaml new file mode 100644 index 0000000..d48c69a --- /dev/null +++ b/src/main/resources/monitoring/prometheus/cluster-role.yaml @@ -0,0 +1,37 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + name: prometheus + name: prometheus +rules: +- apiGroups: + - "" + resources: + - nodes + - nodes/proxy + - nodes/metrics + - services + - endpoints + - pods + - ingresses + - configmaps + - events + verbs: + - get + - list + - watch +- apiGroups: + - "extensions" + - 
"networking.k8s.io" + resources: + - ingresses/status + - ingresses + verbs: + - get + - list + - watch +- nonResourceURLs: + - /metrics + verbs: + - get diff --git a/src/main/resources/monitoring/prometheus/config.yaml b/src/main/resources/monitoring/prometheus/config.yaml new file mode 100644 index 0000000..e2592f7 --- /dev/null +++ b/src/main/resources/monitoring/prometheus/config.yaml @@ -0,0 +1,8 @@ +apiVersion: v1 +kind: Secret +metadata: + name: prometheus-conf + namespace: monitoring +type: Opaque +stringData: + prometheus.yaml: FILECONTENT diff --git a/src/main/resources/monitoring/prometheus/deployment.yaml b/src/main/resources/monitoring/prometheus/deployment.yaml new file mode 100644 index 0000000..f961a0d --- /dev/null +++ b/src/main/resources/monitoring/prometheus/deployment.yaml @@ -0,0 +1,42 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + labels: + app.kubernetes.io/name: prometheus + name: prometheus + namespace: monitoring +spec: + selector: + matchLabels: + app.kubernetes.io/name: prometheus + replicas: 1 + template: + metadata: + labels: + app.kubernetes.io/name: prometheus + spec: + serviceAccountName: prometheus + enableServiceLinks: true + containers: + - name: prometheus + image: "quay.io/prometheus/prometheus:v2.39.1" + imagePullPolicy: "IfNotPresent" + args: + - --config.file=/etc/prometheus/prometheus.yaml + - --storage.tsdb.path=/prometheus/ + - --storage.tsdb.retention.time=1d + ports: + - containerPort: 9090 + volumeMounts: + - name: prometheus-config-volume + mountPath: /etc/prometheus/ + readOnly: true + - name: prometheus-storage-volume + mountPath: /prometheus/ + volumes: + - name: prometheus-config-volume + secret: + secretName: prometheus-conf + defaultMode: 420 + - name: prometheus-storage-volume + emptyDir: {} diff --git a/src/main/resources/monitoring/prometheus/prometheus.yaml b/src/main/resources/monitoring/prometheus/prometheus.yaml new file mode 100644 index 0000000..2ffa360 --- /dev/null +++ 
b/src/main/resources/monitoring/prometheus/prometheus.yaml @@ -0,0 +1,64 @@ +global: + scrape_interval: 60s + evaluation_interval: 60s + external_labels: + cluster: $CLUSTERNAME + stage: $TEST_OR_PROD +remote_write: + - url: GRAFANA_CLOUD_URL + basic_auth: + username: GRAFANA_CLOUD_USER + password: GRAFANA_CLOUD_PASSWORD + write_relabel_configs: + - source_labels: + - __name__ + regex: FILTER_REGEX + action: keep +rule_files: + - /etc/prometheus/prometheus.rules +scrape_configs: + + - job_name: 'kubernetes-nodes' + kubernetes_sd_configs: + - role: node + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + - target_label: __address__ + replacement: kubernetes.default.svc:443 + - source_labels: [__meta_kubernetes_node_name] + regex: (.+) + target_label: __metrics_path__ + replacement: /api/v1/nodes/${1}/proxy/metrics + + - job_name: 'node-exporter' + kubernetes_sd_configs: + - role: endpoints + relabel_configs: + - source_labels: [__meta_kubernetes_endpoints_name] + regex: 'node-exporter' + action: keep + + - job_name: 'traefik' + kubernetes_sd_configs: + - role: endpoints + relabel_configs: + - source_labels: [__meta_kubernetes_endpoints_name] + regex: 'traefik' + action: keep + + - job_name: 'kube-state-metrics' + kubernetes_sd_configs: + - role: endpoints + relabel_configs: + - source_labels: [__meta_kubernetes_endpoints_name] + regex: 'kube-state-metrics' + action: keep + + - job_name: "prometheus" + static_configs: + - targets: ["localhost:9090"] diff --git a/src/main/resources/monitoring/prometheus/service-account.yaml b/src/main/resources/monitoring/prometheus/service-account.yaml new file mode 100644 index 0000000..c7f35b5 --- /dev/null +++ b/src/main/resources/monitoring/prometheus/service-account.yaml @@ -0,0 +1,8 @@ +apiVersion: v1 +kind: ServiceAccount 
+automountServiceAccountToken: true +metadata: + labels: + name: prometheus + name: prometheus + namespace: monitoring diff --git a/src/main/resources/monitoring/prometheus/service.yaml b/src/main/resources/monitoring/prometheus/service.yaml new file mode 100644 index 0000000..c6b84b2 --- /dev/null +++ b/src/main/resources/monitoring/prometheus/service.yaml @@ -0,0 +1,18 @@ +apiVersion: v1 +kind: Service +metadata: + name: prometheus + namespace: monitoring + labels: + app.kubernetes.io/name: prometheus + annotations: + metallb.universe.tf/address-pool: private +spec: + type: LoadBalancer + selector: + app.kubernetes.io/name: prometheus + ports: + - name: prometheus-http + protocol: TCP + port: 9000 + targetPort: 9090 diff --git a/src/test/clj/dda/c4k_common/common_spec_test.cljc b/src/test/clj/dda/c4k_common/common_spec_test.cljc new file mode 100644 index 0000000..7a086e7 --- /dev/null +++ b/src/test/clj/dda/c4k_common/common_spec_test.cljc @@ -0,0 +1,16 @@ +(ns dda.c4k-common.common-spec-test + (:require + #?(:clj [clojure.test :refer [deftest is are testing run-tests]] + :cljs [cljs.test :refer-macros [deftest is are testing run-tests]]) + [clojure.spec.test.alpha :as st] + [dda.c4k-common.common :as cut])) + +(deftest should-refuse-illegal-inputs + (is (thrown? Exception + (cut/concat-vec ["a1" "a2"] "b1"))) + (is (thrown? Exception + (cut/concat-vec ["a1" "a2"] nil))) + (is (thrown? Exception + (cut/concat-vec ["a1" "a2"] 2))) + (is (thrown? 
Exception + (cut/concat-vec {"a1" "a2"} [])))) \ No newline at end of file diff --git a/src/test/clj/dda/c4k_common/monitoring_regex_test.clj b/src/test/clj/dda/c4k_common/monitoring_regex_test.clj new file mode 100644 index 0000000..196a911 --- /dev/null +++ b/src/test/clj/dda/c4k_common/monitoring_regex_test.clj @@ -0,0 +1,24 @@ +(ns dda.c4k-common.monitoring-regex-test + (:require + [clojure.test :refer [deftest is are testing run-tests]] + [data-test :refer :all] + [dda.c4k-common.monitoring :as cut])) + +(defn filter-by-regex + [regex-str collection] + (filterv #(re-matches (re-pattern regex-str) %) + collection)) + +(defdatatest should-filter-metrik [input expected] + (is (= (:node-metrics expected) + (filter-by-regex + (:node-regex cut/metric-regex) + (into (:node-metrics expected) (:additional-node-metrics input))))) + (is (= (:traefik-metrics expected) + (filter-by-regex + (:traefik-regex cut/metric-regex) + (into (:traefik-metrics expected) (:additional-traefik-metrics input))))) + (is (= (:kube-state-metrics expected) + (filter-by-regex + (:kube-state-regex cut/metric-regex) + (into (:kube-state-metrics expected) (:additional-kube-state-metrics input)))))) diff --git a/src/test/cljc/dda/c4k_common/common_test.cljc b/src/test/cljc/dda/c4k_common/common_test.cljc index f662af5..7bdaeb5 100644 --- a/src/test/cljc/dda/c4k_common/common_test.cljc +++ b/src/test/cljc/dda/c4k_common/common_test.cljc @@ -24,4 +24,4 @@ (is (thrown? Exception (cut/concat-vec ["a1" "a2"] 2))) (is (thrown? 
Exception - (cut/concat-vec {"a1" "a2"} []))))) \ No newline at end of file + (cut/concat-vec {"a1" "a2"} []))))) diff --git a/src/test/cljc/dda/c4k_common/monitoring_test.cljc b/src/test/cljc/dda/c4k_common/monitoring_test.cljc new file mode 100644 index 0000000..4e733d4 --- /dev/null +++ b/src/test/cljc/dda/c4k_common/monitoring_test.cljc @@ -0,0 +1,47 @@ +(ns dda.c4k-common.monitoring-test + (:require + #?(:clj [clojure.test :refer [deftest is are testing run-tests]] + :cljs [cljs.test :refer-macros [deftest is are testing run-tests]]) + [clojure.string :as s] + [clojure.spec.test.alpha :as st] + [dda.c4k-common.monitoring :as cut] + [dda.c4k-common.yaml :as yaml] + [clojure.string :as str])) + +(st/instrument `cut/generate) +(st/instrument `cut/generate-stateful-set) +(st/instrument `cut/generate-agent-config) +(st/instrument `cut/generate-config) + +(def conf {:k3s-cluster-name "clustername" + :k3s-cluster-stage :test + :grafana-cloud-url "url"}) + +(def auth {:grafana-cloud-user "user" + :grafana-cloud-password "password" + :hetzner-cloud-ro-token "ro-token"}) + +(deftest should-generate + (is (= 17 + (count (cut/generate conf auth))))) + +(deftest should-generate-prometheus-remote-write-auth + (is (= {:username "user", + :password "password"} + (get-in + (cut/generate-prometheus-config conf auth) + [:remote_write 0 :basic_auth])))) + +(deftest should-generate-prometheus-external-labels + (is (= {:cluster "clustername", + :stage :test} + (get-in + (cut/generate-prometheus-config conf auth) + [:global :external_labels])))) + +(deftest should-generate-config + (is (s/starts-with? 
+ (get-in + (cut/generate-config conf auth) + [:stringData :prometheus.yaml]) + "global:\n scrape_interval:"))) \ No newline at end of file diff --git a/src/test/resources/dda/c4k_common/monitoring_regex_test/should_filter_metrik.edn b/src/test/resources/dda/c4k_common/monitoring_regex_test/should_filter_metrik.edn new file mode 100644 index 0000000..789c340 --- /dev/null +++ b/src/test/resources/dda/c4k_common/monitoring_regex_test/should_filter_metrik.edn @@ -0,0 +1,19 @@ +{:input {:additional-node-metrics ["go_gc_duration_seconds"] + :additional-traefik-metrics ["traefik_config_reloads_total"] + :additional-kube-state-metrics ["kube_persistentvolume_annotations"]} +:expected +{:node-metrics + ["node_cpu_seconds_total" "node_load1" "node_load5" "node_load15" "node_memory_MemTotal_bytes" + "node_memory_MemFree_bytes" "node_memory_Buffers_bytes" "node_memory_Cached_bytes" + "node_disk_read_bytes_total" "node_disk_written_bytes_total" "node_disk_io_time_seconds_total" + "node_filesystem_size_bytes" "node_filesystem_avail_bytes" "node_network_receive_bytes_total" + "node_network_transmit_bytes_total"] + :traefik-metrics + ["traefik_entrypoint_requests_total" "traefik_entrypoint_requests_duration_seconds_count" + "traefik_entrypoint_requests_tls_total" "traefik_router_requests_total" + "traefik_router_requests_tls_total" + "traefik_service_requests_total" "traefik_service_requests_duration_seconds_count" + "traefik_service_requests_tls_total" "traefik_tls_certs_not_after"] + :kube-state-metrics + ["kube_pod_container_status_restarts_total" "kube_pod_status_reason" "kube_node_status_allocatable" + "kube_node_status_capacity" "kube_cronjob_status_active" "kube_job_status_failed"]}} \ No newline at end of file diff --git a/src/test/resources/metrics_list.txt b/src/test/resources/metrics_list.txt new file mode 100644 index 0000000..1cde61a --- /dev/null +++ b/src/test/resources/metrics_list.txt @@ -0,0 +1,30 @@ +node_cpu_seconds_total +node_load1 +node_load5 +node_load15 
+node_memory_MemTotal_bytes +node_memory_MemFree_bytes +node_memory_Buffers_bytes +node_memory_Cached_bytes +node_disk_read_bytes_total +node_disk_written_bytes_total +node_disk_io_time_seconds_total +node_filesystem_size_bytes +node_filesystem_avail_bytes +node_network_receive_bytes_total +node_network_transmit_bytes_total +traefik_entrypoint_requests_total +traefik_entrypoint_requests_duration_seconds_count +traefik_entrypoint_requests_tls_total +traefik_router_requests_total +traefik_router_requests_tls_total +traefik_service_requests_total +traefik_service_requests_duration_seconds_count +traefik_service_requests_tls_total +traefik_tls_certs_not_after +kube_pod_container_status_restarts_total +kube_pod_status_reason +kube_node_status_allocatable +kube_node_status_capacity +kube_cronjob_status_active +kube_job_status_failed \ No newline at end of file