Monitoring
Requirements
- 2GB of RAM
- ~100GB of Disk
Prometheus
Server
mkdir -p /srv/monitoring/prometheus/{config,data}
/srv/monitoring/prometheus/config/prometheus.yml
# Prometheus server configuration (mounted at /etc/prometheus/prometheus.yml).
global:
  # Default scrape interval for every job that does not override it.
  scrape_interval: 15s
  external_labels:
    monitor: "prometheus-monitoring"

alerting:
  alertmanagers:
    - static_configs:
        - targets:
            # Prometheus runs in its own container, where "localhost" is the
            # Prometheus container itself. Alertmanager publishes port 9093 on
            # the host, which is reachable through the host-gateway alias
            # added with --add-host=host.docker.internal:host-gateway.
            - "host.docker.internal:9093"

rule_files:
  # Provided by the read-only /srv/monitoring/alertmanager/config mount.
  - /etc/alertmanager/alert.rules

scrape_configs:
  # Prometheus scraping its own metrics endpoint.
  - job_name: "Prometheus"
    scrape_interval: 15s
    static_configs:
      - targets:
          - "localhost:9090"

  # Targets listed in targets.json, which is mounted alongside this file
  # at /etc/prometheus/targets.json.
  - job_name: "Dynamic"
    file_sd_configs:
      - files:
          - targets.json
/srv/monitoring/prometheus/config/targets.json
[
{
"targets": ["192.168.2.55:9999"],
"labels": {
"alias": "gpu-miner",
"service": "GPU Miner nvidia-smi",
"hardware": "NVIDIA RTX 4070 Ti"
}
},
{
"targets": ["host.docker.internal:9999"],
"labels": {
"alias": "mina-snark",
"service": "Mina SNARK Fees",
"__scrape_interval__": "2m"
}
},
{
"targets": ["host.docker.internal:3395"],
"labels": {
"alias": "mina-libp2p",
"service": "Mina Block Producer, Snark, Uptime"
}
},
{
"targets": ["host.docker.internal:3396"],
"labels": {
"alias": "mina",
"service": "Mina Block Producer, Snark, Uptime"
}
},
{
"targets": ["host.docker.internal:1442"],
"labels": {
"alias": "Kupo",
"service": "Cardano Kupo",
"__metrics_path__": "/health"
}
},
{
"targets": ["host.docker.internal:1337"],
"labels": {
"alias": "Ogmios",
"service": "Cardano Ogmios"
}
},
{
"targets": ["host.docker.internal:9100"],
"labels": {
"alias": "Linux Exporter",
"service": "Linux Exporter"
}
},
{
"targets": ["host.docker.internal:12798"],
"labels": {
"alias": "Cardano Node",
"service": "Cardano Node"
}
}
]
mkdir -p /srv/monitoring/prometheus/{config,data}
docker run \
-d \
--restart always \
--name prometheus \
-p 9090:9090 \
--log-driver syslog --log-opt syslog-address="udp://localhost:5514" --log-opt syslog-format="rfc3164" --log-opt tag="prometheus" \
--add-host=host.docker.internal:host-gateway \
-v /srv/monitoring/prometheus/data:/prometheus \
-v "/srv/monitoring/prometheus/config/prometheus.yml:/etc/prometheus/prometheus.yml:ro" \
-v "/srv/monitoring/prometheus/config/targets.json:/etc/prometheus/targets.json:ro" \
-v /srv/monitoring/alertmanager/config:/etc/alertmanager:ro \
prom/prometheus:v2.46.0
Alert Manager
Server
mkdir -p /srv/monitoring/alertmanager/{config,data}
docker run \
-d \
--name alertmanager \
--restart always \
-p 9093:9093 \
--log-driver syslog --log-opt syslog-address="udp://localhost:5514" --log-opt syslog-format="rfc3164" --log-opt tag="alertmanager" \
--add-host=host.docker.internal:host-gateway \
-v "/srv/monitoring/alertmanager/config:/etc/alertmanager:ro" \
-v /srv/monitoring/alertmanager/data:/alertmanager \
prom/alertmanager:v0.26.0
/srv/monitoring/alertmanager/config/alertmanager.yml
# Alertmanager configuration: every alert is routed to one Discord receiver.
# No global overrides are set; the explicit empty mapping keeps the defaults.
global: {}

route:
  # Wait 3 hours before re-sending a notification for a still-firing alert.
  repeat_interval: 3h
  receiver: Discord
  # Alerts sharing all of these label values are grouped into one notification.
  group_by:
    - job
    - alertname
    - service
    - alias

receivers:
  - name: Discord
    discord_configs:
      # Placeholder: substitute the real Discord webhook URL before starting.
      - webhook_url: https://discord.com/api/webhooks/XXX/YYY
/srv/monitoring/alertmanager/config/alert.rules
# Prometheus alerting rules, loaded through rule_files in prometheus.yml.
groups:
  - name: SystemUp
    rules:
      # Fires when a scrape target has reported up == 0 for 3 minutes.
      - alert: Node down
        expr: up == 0
        for: 3m
        labels:
          severity: critical
        annotations:
          title: "Node {{ $labels.alias }} is down"
          description: >-
            Failed to scrape {{ $labels.job }} on {{ $labels.alias }} for more
            than 3 minutes. Node seems down.

  - name: linuxSystem
    rules:
      # Non-idle CPU percentage, averaged per instance, above 95% for 3 minutes.
      # NOTE(review): the Prometheus docs recommend rate() over irate() for
      # alerting expressions — consider switching after confirming behavior.
      - alert: CPU Usage
        expr: 100 - (avg by (instance) (irate(node_cpu_seconds_total{job='Dynamic',mode="idle"}[5m])) * 100) > 95
        for: 3m
        labels:
          severity: critical
        annotations:
          summary: Machine Under Heavy Load

      # Root filesystem ("/") has less than 15% free space for 5 minutes.
      - alert: Low free space
        expr: (node_filesystem_free_bytes{mountpoint = "/"} / node_filesystem_size_bytes{mountpoint = "/"} *100) < 15
        for: 5m
        labels:
          severity: warning
        annotations:
          title: "Low free space on {{ $labels.alias }}"
          description: >-
            On {{ $labels.alias }} device {{ $labels.device }} mounted on
            {{ $labels.mountpoint }} has low free space of {{ $value }}%

      # Memory in use (MemTotal - MemFree - Cached) above 90% for 3 minutes.
      - alert: High Memory Load
        expr: (((node_memory_MemTotal_bytes-node_memory_MemFree_bytes-node_memory_Cached_bytes)/(node_memory_MemTotal_bytes)*100)) > 90
        for: 3m
        labels:
          severity: critical
        annotations:
          title: "High Memory Load on {{ $labels.alias }}"
Open Observe
Server
mkdir -p /srv/monitoring/openobserve/data
docker run \
-d \
--name openobserve \
--restart always \
--log-driver json-file --log-opt max-size="200k" --log-opt max-file="10" \
--add-host=host.docker.internal:host-gateway \
-v /srv/monitoring/openobserve/data:/data \
-p 5080:5080 \
-p 5514:5514 \
-p 5514:5514/udp \
-e ZO_DATA_DIR=/data \
-e ZO_ROOT_USER_EMAIL=root@example.com \
-e ZO_ROOT_USER_PASSWORD=YOUR_PASSWORD \
-e ZO_TELEMETRY=false \
-e ZO_PROMETHEUS_ENABLED=true \
-e ZO_COMPACT_DATA_RETENTION_DAYS=90 \
public.ecr.aws/zinclabs/openobserve:v0.5.2
See below for SELinux configuration
Grafana
Server
mkdir -p /srv/monitoring/grafana/
docker run \
-d \
--name grafana \
--restart always \
--log-driver syslog --log-opt syslog-address="udp://localhost:5514" --log-opt syslog-format="rfc3164" --log-opt tag="grafana" \
--add-host=host.docker.internal:host-gateway \
-p 3000:3000 \
-v /srv/monitoring/grafana/:/var/lib/grafana \
grafana/grafana-oss:10.1.1
Client
Register Linux Exporter
sudo useradd -rs /bin/false node_exporter
curl -L https://github.com/prometheus/node_exporter/releases/download/v1.6.1/node_exporter-1.6.1.linux-amd64.tar.gz | tar xvz -C /tmp
sudo cp /tmp/node_exporter-1.6.1.linux-amd64/node_exporter /usr/local/bin/
/etc/systemd/system/node_exporter.service
[Unit]
Description=Node Exporter
After=network.target
[Service]
User=node_exporter
Group=node_exporter
Type=simple
ExecStart=/usr/local/bin/node_exporter
[Install]
WantedBy=multi-user.target
sudo systemctl daemon-reload
sudo systemctl enable node_exporter
sudo systemctl start node_exporter
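Note for SELinux-enabled hosts:
If SELinux blocks the service from starting, find the denial in the log, then generate and install a local policy module (pass the alert ID printed in the log to `sealert -l`):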
sudo cat /var/log/messages
sealert -l 0603746b-9f37-4a15-9184-9078837c8d6d
sudo ausearch -c 'systemd' --raw | audit2allow -M my-systemd
sudo semodule -X 300 -i my-systemd.pp
sudo restorecon -v '/etc/systemd/system/node_exporter.service'
sudo systemctl enable --now node_exporter
sudo systemctl status node_exporter
rm my-systemd.*
Configure rsyslog to forward host logs to OpenObserve
```bash
sudo dnf install -y rsyslog
```
`/etc/rsyslog.d/51-monitoring.conf`
```text
*.info;mail.none;authpriv.none;cron.none @localhost:5514
```
Replace localhost with the IP address or DNS name of the OpenObserve instance.
sudo systemctl daemon-reload
sudo systemctl enable rsyslog
sudo systemctl start rsyslog
SELinux
sudo ausearch -c 'rsyslogd' --raw | audit2allow -M my-rsyslogd
sudo semodule -i my-rsyslogd.pp
sudo /sbin/restorecon -v /etc/rsyslog.d/51-monitoring.conf
logger "Testing Open Observe Integration through syslog"
- TODO: Create alert in open observe to connect discord webhook
Files
- alertmanager.yml
- alert.rules
- prometheus.yml
- targets.json
- 51-monitoring.conf
- node_exporter.service
- docker-compose.yml
TL;DR
sudo dnf install -y rsyslog
curl -O https://webuxlab.com/content/devops/monitoring/monitoring/syslogd/51-monitoring.conf
sudo mv 51-monitoring.conf /etc/rsyslog.d/51-monitoring.conf
sudo useradd -rs /bin/false node_exporter
curl -L https://github.com/prometheus/node_exporter/releases/download/v1.6.1/node_exporter-1.6.1.linux-amd64.tar.gz | tar xvz -C /tmp
sudo cp /tmp/node_exporter-1.6.1.linux-amd64/node_exporter /usr/local/bin/
curl -O https://webuxlab.com/content/devops/monitoring/monitoring/node_exporter/node_exporter.service
sudo mv node_exporter.service /etc/systemd/system/node_exporter.service
sudo chown root:root /etc/systemd/system/node_exporter.service
sudo chmod 644 /etc/systemd/system/node_exporter.service
sudo restorecon -rv /usr/local/bin/node_exporter
sudo mkdir -p /srv/monitoring
sudo chown -R 1000:1000 /srv/monitoring
cd /srv/monitoring
mkdir -p {alertmanager,prometheus}/{config,data}
pushd alertmanager/config
curl -O https://webuxlab.com/content/devops/monitoring/monitoring/alertmanager/config/alertmanager.yml
curl -O https://webuxlab.com/content/devops/monitoring/monitoring/alertmanager/config/alert.rules
popd
pushd prometheus/config
curl -O https://webuxlab.com/content/devops/monitoring/monitoring/prometheus/config/prometheus.yml
curl -O https://webuxlab.com/content/devops/monitoring/monitoring/prometheus/config/targets.json
popd
curl -O https://webuxlab.com/content/devops/monitoring/monitoring/docker-compose.yml
sudo systemctl daemon-reload
sudo systemctl enable --now rsyslog
sudo systemctl status rsyslog
sudo systemctl enable --now node_exporter
sudo systemctl status node_exporter
sed -i 's|https://discord.com/api/webhooks/XXX/YYY|___YOUR_WEBHOOK___|' alertmanager/config/alertmanager.yml
sudo mkdir -p grafana/
sudo chown 1000:1000 grafana
sudo mkdir -p openobserve/data/
sudo chown 1000:1000 openobserve/data/
sudo chown -R 999:999 alertmanager/data/
sudo chown -R 999:999 prometheus/data/
sudo chown -R 1000:1000 openobserve/
sudo chown -R 1000:1000 openobserve/data/
sudo chmod -R 777 openobserve/
docker compose up -d
logger "Testing Open Observe Integration through syslog"