5
									
								
								roles/prometheus/handlers/main.yml
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										5
									
								
								roles/prometheus/handlers/main.yml
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,5 @@
 | 
			
		||||
---
 | 
			
		||||
# Handler: restarts the prometheus service. Tasks in this role trigger it
# with "notify: Restart Prometheus".
- name: Restart Prometheus
  service:
    name: prometheus
    state: restarted
 | 
			
		||||
							
								
								
									
										42
									
								
								roles/prometheus/tasks/main.yml
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										42
									
								
								roles/prometheus/tasks/main.yml
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,42 @@
 | 
			
		||||
---
 | 
			
		||||
# Install the prometheus package after refreshing the apt cache.
# The result is registered so the task can be retried (3 retries)
# until the apt call reports success.
- name: Install Prometheus
  apt:
    name: prometheus
    update_cache: true
  register: apt_result
  until: apt_result is succeeded
  retries: 3
 | 
			
		||||
 | 
			
		||||
# Render the main Prometheus configuration from its template.
# A change notifies the "Restart Prometheus" handler.
- name: Configure Prometheus
  template:
    src: prometheus/prometheus.yml.j2
    dest: /etc/prometheus/prometheus.yml
    # Quoted so the mode is passed as a string and never re-interpreted
    # as an octal/decimal integer by the YAML parser.
    mode: "0644"
  notify: Restart Prometheus
 | 
			
		||||
 | 
			
		||||
# Render the alerting rules file from its template.
# A change notifies the "Restart Prometheus" handler.
- name: Configure Prometheus alert rules
  template:
    src: prometheus/alert.rules.yml.j2
    dest: /etc/prometheus/alert.rules.yml
    # Quoted so the mode is passed as a string and never re-interpreted
    # as an octal/decimal integer by the YAML parser.
    mode: "0644"
  notify: Restart Prometheus
 | 
			
		||||
 | 
			
		||||
# Write one JSON targets file per entry of the "prometheus" dictionary:
# each file holds a single-element list with that entry's "targets".
# No handler is notified here: we don't need to restart Prometheus when
# updating nodes.
- name: Configure Prometheus targets
  copy:
    content: "{{ [{'targets': item.value.targets}] | to_nice_json }}\n"
    dest: "/etc/prometheus/{{ item.value.file }}"
    # Quoted so the mode is passed as a string and never re-interpreted
    # as an octal/decimal integer by the YAML parser.
    mode: "0644"
  loop: "{{ prometheus | dict2items }}"
 | 
			
		||||
 | 
			
		||||
# Make sure the prometheus unit is enabled and currently started.
- name: Activate prometheus service
  systemd:
    name: prometheus
    state: started
    enabled: true
 | 
			
		||||
 | 
			
		||||
# Install the update-motd.d snippet for this role from its template.
- name: Indicate role in motd
  template:
    src: update-motd.d/05-service.j2
    dest: /etc/update-motd.d/05-prometheus
    # Executable mode, quoted so it is passed as a string and never
    # re-interpreted as an octal/decimal integer by the YAML parser.
    mode: "0755"
 | 
			
		||||
							
								
								
									
										187
									
								
								roles/prometheus/templates/prometheus/alert.rules.yml.j2
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										187
									
								
								roles/prometheus/templates/prometheus/alert.rules.yml.j2
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,187 @@
 | 
			
		||||
{{ ansible_header | comment }}
 | 
			
		||||
{# Prometheus alert annotations use the same double-brace syntax as Jinja2, so the rules are wrapped in a raw block to avoid conflicts #}
 | 
			
		||||
{# These rules depend on the Prometheus node exporter version and may need to be adapted for other versions #}
 | 
			
		||||
{% raw %}
 | 
			
		||||
groups:
 | 
			
		||||
- name: alert.rules
 | 
			
		||||
  rules:
 | 
			
		||||
 | 
			
		||||
  # Critical alert when a target has reported "up == 0" for 3 minutes.
  - alert: InstanceDown
    expr: up == 0
    for: 3m
    labels:
      severity: critical
    annotations:
      summary: "{{ $labels.instance }} ({{ $labels.job }}) est invisible depuis plus de 3 minutes !"
 | 
			
		||||
 | 
			
		||||
  # Alert when less than 10% of total memory has been "free" for 5 minutes.
  # Memory that is not used by applications (free, cached, buffers, page
  # tables, vmalloc, swap cache and slab) is all counted as free here.
  - alert: OutOfMemory
    # Folded scalar: the lines below are joined with single spaces.
    expr: >-
      (node_memory_MemFree_bytes + node_memory_Cached_bytes
      + node_memory_Buffers_bytes + node_memory_PageTables_bytes
      + node_memory_VmallocUsed_bytes + node_memory_SwapCached_bytes
      + node_memory_Slab_bytes)
      / node_memory_MemTotal_bytes * 100 < 10
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Mémoire libre de {{ $labels.instance }} à {{ $value }}%."
 | 
			
		||||
 | 
			
		||||
  # Alert when an ext4 filesystem has had less than 10% free space
  # for 5 minutes.
  - alert: OutOfDiskSpace
    expr: node_filesystem_free_bytes{fstype="ext4"} / node_filesystem_size_bytes{fstype="ext4"} * 100 < 10
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Espace libre de {{ $labels.mountpoint }} sur {{ $labels.instance }} à {{ $value }}%."
 | 
			
		||||
 | 
			
		||||
  # Alert when an ext4 filesystem has had less than 10% free inodes
  # for 5 minutes.
  - alert: OutOfInodes
    expr: node_filesystem_files_free{fstype="ext4"} / node_filesystem_files{fstype="ext4"} * 100 < 10
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Presque plus d'inodes disponibles ({{ $value }}% restant) dans {{ $labels.mountpoint }} sur {{ $labels.instance }}."
 | 
			
		||||
 | 
			
		||||
  # Alert on high system load: node_load5 (5-minute load average)
  # above 9 for 10 minutes.
  - alert: CpuBusy
    expr: node_load5 > 9
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: "Charge sur {{ $labels.instance }} à {{ $value }}."
 | 
			
		||||
 | 
			
		||||
  # mdadm software RAID: alert when an array has had fewer active disks
  # than total disks for 3 minutes.
  - alert: SoftwareRAIDDegraded
    expr: node_md_disks-node_md_disks_active > 0
    for: 3m
    labels:
      severity: warning
    annotations:
      summary: "Le RAID sur {{ $labels.instance }} a perdu {{ $value }} disque(s)."
 | 
			
		||||
 | 
			
		||||
  # systemd units (> buster): alert when a unit has been in the "failed"
  # state for 10 minutes.
  - alert: SystemdServiceFailed
    expr: node_systemd_unit_state{state="failed"} == 1
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: "{{ $labels.name }} a échoué sur {{ $labels.instance }}"
 | 
			
		||||
 | 
			
		||||
  # UPS checks. Every rule below must hold for 5 minutes before firing and
  # links to the power-supply Grafana dashboard in its description.

  # Output source differs from the expected value (3).
  - alert: UpsOutputSourceChanged
    expr: upsOutputSource != 3
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "La source d'alimentation de {{ $labels.instance }} a changé !"
      description: "https://grafana.crans.org/d/qtbg59mZz/alimentation"

  # Battery status differs from the expected value (2).
  - alert: UpsBatteryStatusChanged
    expr: upsBatteryStatus != 2
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "L'état de la batterie de {{ $labels.instance }} a changé !"
      description: "https://grafana.crans.org/d/qtbg59mZz/alimentation"

  # Remote temperature outside the 10..26 range.
  - alert: UpsTemperatureWarning
    expr: (xupsEnvRemoteTemp < 10) or (xupsEnvRemoteTemp > 26)
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "La température autour de {{ $labels.instance }} est de {{ $value }}°C."
      description: "https://grafana.crans.org/d/qtbg59mZz/alimentation"

  # Remote temperature outside the 0..30 range.
  - alert: UpsTemperatureCritical
    expr: (xupsEnvRemoteTemp < 0) or (xupsEnvRemoteTemp > 30)
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "La température autour de {{ $labels.instance }} est de {{ $value }}°C !"
      description: "https://grafana.crans.org/d/qtbg59mZz/alimentation"

  # Remote humidity above 65.
  - alert: UpsHighHumidity
    expr: xupsEnvRemoteHumidity > 65
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "L'humidité autour de {{ $labels.instance }} est de {{ $value }}%."
      description: "https://grafana.crans.org/d/qtbg59mZz/alimentation"

  # Remote humidity above 85.
  - alert: UpsVeryHighHumidity
    expr: xupsEnvRemoteHumidity > 85
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "L'humidité autour de {{ $labels.instance }} est de {{ $value }}% !"
      description: "https://grafana.crans.org/d/qtbg59mZz/alimentation"

  # Output load above 70 percent.
  - alert: UpsHighLoad
    expr: upsOutputPercentLoad > 70
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "La charge de {{ $labels.instance }} est de {{ $value }}% !"
      description: "https://grafana.crans.org/d/qtbg59mZz/alimentation"

  # Input voltage outside the 210..250 range.
  - alert: UpsWrongInputVoltage
    expr: (upsInputVoltage < 210) or (upsInputVoltage > 250)
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "La tension d'entrée de {{ $labels.instance }} est de {{ $value }}V."
      description: "https://grafana.crans.org/d/qtbg59mZz/alimentation"

  # Output voltage outside the 215..245 range.
  - alert: UpsWrongOutputVoltage
    expr: (upsOutputVoltage < 215) or (upsOutputVoltage > 245)
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "La tension de sortie de {{ $labels.instance }} est de {{ $value }}V."
      description: "https://grafana.crans.org/d/qtbg59mZz/alimentation"
 | 
			
		||||
 | 
			
		||||
  # Alert when apt_autoremove_pending has been above 0 for 5 minutes.
  - alert: AptAutoremovePending
    expr: apt_autoremove_pending > 0
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "{{ $value }} paquet(s) APT sont inutile(s) sur {{ $labels.instance }}."
 | 
			
		||||
 | 
			
		||||
  # Alert when the postfix mail queue has held more than 25 mails
  # for 1 minute.
  - alert: MailqNotEmpty
    expr: postfix_mailq_length > 25
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: "{{ $value }} mails dans la mailq sur {{ $labels.instance }}."
 | 
			
		||||
 | 
			
		||||
  # Alert when the rate of radiusd_access_ok over a 3-minute window
  # has been zero for 2 minutes.
  - alert: NoRadiusLogin
    expr: rate(radiusd_access_ok[3m]) == 0
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "Personne ne vient taper le RADIUS."
 | 
			
		||||
 | 
			
		||||
  # Alert when a disk's raw reallocated-sector count has been above 1000
  # for 5 minutes.
  - alert: TooManyReallocatedSectors
    expr: smartmon_reallocated_sector_ct_raw_value > 1e3
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "{{ $labels.disk }} sur {{ $labels.instance }} a {{ $value }} secteurs réalloués."
 | 
			
		||||
 | 
			
		||||
{% endraw %}
 | 
			
		||||
							
								
								
									
										42
									
								
								roles/prometheus/templates/prometheus/prometheus.yml.j2
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										42
									
								
								roles/prometheus/templates/prometheus/prometheus.yml.j2
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,42 @@
 | 
			
		||||
{{ ansible_header | comment }}
 | 
			
		||||
 | 
			
		||||
global:
  # scrape_interval, evaluation_interval and scrape_timeout are not set
  # here, so the global defaults apply (60s, 60s and 10s respectively).

  # Attach these labels to any time series or alerts when communicating
  # with external systems (federation, remote storage, Alertmanager).
  external_labels:
    # Indented by 2 spaces like the rest of this file.
    monitor: 'example'
 | 
			
		||||
 | 
			
		||||
# Alertmanager configuration: alerts are sent to the Prometheus
# Alertmanager installed on the same machine.
alerting:
  alertmanagers:
  - static_configs:
    - targets:
      - 'localhost:9093'
 | 
			
		||||
 | 
			
		||||
# Rule files are loaded once, then evaluated periodically according to the
# global 'evaluation_interval'.
rule_files:
  # Monitoring alerts, this is the file you may be searching!
  - "alert.rules.yml"
 | 
			
		||||
 | 
			
		||||
# Scrape configurations: a built-in job that scrapes Prometheus itself,
# followed by the first "config" entry of every item of the "prometheus"
# Ansible variable. The result is rendered as YAML.
{{
  {
    "scrape_configs": [
      {
        "job_name": "prometheus",
        "static_configs": [{"targets": ["localhost:9090"]}]
      }
    ] + (prometheus | json_query("*.config[0]"))
  } | to_nice_yaml(indent=2)
}}
 | 
			
		||||
							
								
								
									
										3
									
								
								roles/prometheus/templates/update-motd.d/05-service.j2
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										3
									
								
								roles/prometheus/templates/update-motd.d/05-service.j2
									
									
									
									
									
										Executable file
									
								
							@@ -0,0 +1,3 @@
 | 
			
		||||
#!/usr/bin/tail +14
 | 
			
		||||
{{ ansible_header | comment }}
 | 
			
		||||
[0m> [38;5;82mprometheus[0m a été déployé sur cette machine. Voir [38;5;6m/etc/prometheus/[0m.
 | 
			
		||||
		Reference in New Issue
	
	Block a user