First somewhat working prototype - only monitoring

This commit is contained in:
Davide Oddone 2023-12-31 19:42:33 +01:00
commit 009f00a422
15 changed files with 451 additions and 0 deletions

3
.gitignore vendored Normal file
View File

@ -0,0 +1,3 @@
print_facts/
roles/common
debug.yml

4
cluster.yml Normal file
View File

@ -0,0 +1,4 @@
---
# Entry-point playbook: applies the `monitoring` role to every inventory host.
- hosts: all
  roles:
    - role: monitoring

7
host_vars/rock64.yml Normal file
View File

@ -0,0 +1,7 @@
$ANSIBLE_VAULT;1.1;AES256
31363536343633356337316532313364373738373938386537623030353663356636643332306565
6264633065623966323638366334316333373334363935300a666132353262326532616437653266
33396338326266373662646133333539356366396133316262326266363962366236383639346366
3336363035383931310a656138313033303662363935313039303837653233323265343832383935
64633635623431623561633665616436616231306264353465353637343039363432636432333634
3361633938323730333337353264363761326633383864303464

7
host_vars/rockpro64.yml Normal file
View File

@ -0,0 +1,7 @@
$ANSIBLE_VAULT;1.1;AES256
35336163336431326538313432323733383261653562323139363036663263653939633437323232
3134393539633036383563643563656238626164376337660a613630306164396133633831306630
34323831323631633064363634616530353730396238383031646333366463653231393638643462
3662633431306238370a373663353966313162373937333238653838393739376334616135336133
61393166373136343839383363613439633062646138656636643161366533636330393633343333
6634326334303933306233613833616232376462306437663165

5
hosts Normal file
View File

@ -0,0 +1,5 @@
# Ansible inventory.
# Group membership drives the Netdata templates: hosts in [main] get the
# receiving side of streaming enabled (API key section enabled, listener on
# port 19998), hosts in [worker] get `memory mode = none`, web mode `none`
# and a [cloud] proxy entry.
# NOTE: the templates look up `hostvars['rock64']` by name, so renaming that
# host here requires updating the templates as well.
[main]
# Managed over a local connection; `hostname` is rendered into netdata.conf.
rock64 ansible_connection=local ansible_host=localhost ansible_user=doddo hostname=rock64_1
[worker]
# `hostname` is rendered into netdata.conf.
rockpro64 ansible_host=192.168.0.2 ansible_user=doddo hostname=rockpro64_4G

View File

@ -0,0 +1 @@
See https://github.com/netdata/netdata/tree/master/packaging/installer/methods/ansible.md

View File

@ -0,0 +1,6 @@
# Handler: restarts the `netdata` service. Tasks trigger it with
# `notify: Restart Netdata`, so the name below must not change.
- name: Restart Netdata
  service:
    state: restarted
    name: netdata
  become: true

View File

@ -0,0 +1,31 @@
---
# Connect ("claim") the node to Netdata Cloud.
#
# Exactly one of the two blocks below runs, selected by `reclaim`:
#   - reclaim false: claim once; skipped when the claimed_id file exists.
#   - reclaim true:  claim again with a freshly generated node id, then
#                    restart Netdata.
#
# `reclaim` is passed through `| bool` so that string values such as
# "true"/"false" (e.g. from `-e reclaim=true`) select the right block instead
# of silently matching neither comparison.
# Variables placed on the shell command line go through `| quote` so that
# unexpected characters cannot be interpreted by the shell.

- name: Claim to Netdata Cloud
  when: not (reclaim | bool)
  block:
    - name: Claim to Netdata Cloud if not already
      become: true
      shell:
        cmd: >-
          netdata-claim.sh
          -token={{ claim_token | quote }}
          -rooms={{ claim_rooms | quote }}
          -url={{ claim_url | quote }}
        # The task is skipped when this file already exists.
        creates: /var/lib/netdata/cloud.d/claimed_id

- name: Re-claim a node to Netdata Cloud
  when: reclaim | bool
  block:
    - name: Ensure `uuidgen` is installed
      stat:
        path: /usr/bin/uuidgen
      register: uuidgen_result

    - name: Fail if `uuidgen` is not installed
      fail:
        msg: The system needs `uuidgen` installed to enable re-claiming.
      when: not uuidgen_result.stat.exists

    # `$(uuidgen)` is expanded by the shell on the target, which is why this
    # task uses `shell` rather than `command`.
    - name: Reclaim the node with `-id=`
      become: true
      shell: >-
        netdata-claim.sh
        -token={{ claim_token | quote }}
        -rooms={{ claim_rooms | quote }}
        -url={{ claim_url | quote }}
        -id=$(uuidgen)
      when: uuidgen_result.stat.exists
      notify: Restart Netdata

View File

@ -0,0 +1,15 @@
---
# Render the Netdata configuration files from the role templates.
#
# Both tasks notify the restart handler: a change to either file has to be
# picked up by the agent, not only a change to stream.conf.
# The files are plain configuration, so they are deployed as 0644
# (root-writable, world-readable) rather than with executable bits.

- name: Deploy netdata.conf
  become: true
  template:
    src: ../templates/netdata.conf.j2
    dest: /etc/netdata/netdata.conf
    owner: root
    group: root
    mode: "0644"
  notify: Restart Netdata

- name: Deploy stream.conf
  become: true
  template:
    src: ../templates/stream.conf.j2
    dest: /etc/netdata/stream.conf
    owner: root
    group: root
    mode: "0644"
  notify: Restart Netdata

View File

@ -0,0 +1,14 @@
---
# Install Netdata with the kickstart script: download it, run it, delete it.
# NOTE(review): there is no `creates:` guard, so the installer is downloaded
# and executed on every run of this task file.

- name: Download the installation script
  get_url:
    dest: "~/kickstart.sh"
    url: "https://my-netdata.io/kickstart.sh"
    mode: "+x"

- name: Install Netdata
  command:
    cmd: "~/kickstart.sh --dont-wait"

- name: Cleanup installation script
  file:
    state: absent
    path: "~/kickstart.sh"

View File

@ -0,0 +1,16 @@
---
# Role entry point for Netdata.
# Runs the three task files in order (install, configure, claim); each import
# is executed with privilege escalation through sudo.

- name: Install Netdata
  import_tasks: install.yml
  become_method: sudo
  become: true

- name: Configure Netdata
  import_tasks: configure.yml
  become_method: sudo
  become: true

- name: Claim the node to Netdata Cloud
  import_tasks: claim.yml
  become_method: sudo
  become: true

View File

@ -0,0 +1,21 @@
# Netdata configuration
{# Rendered per host; the branches are selected by inventory group membership
   ('main' vs 'worker'). The address of the host named 'rock64' on interface
   'end0' is used for the streaming listener and for the cloud proxy. #}
[global]
{# Emit a hostname override only when the host defines a non-empty `hostname`;
   the `is defined` guard keeps hosts without that variable from failing. #}
{% if hostname is defined and hostname %}
hostname = {{ hostname }}
{% endif %}
dbengine multihost disk space = {{ dbengine_multihost_disk_space }}
{% if 'worker' in group_names %}
memory mode = none
{% endif %}
[web]
mode = {{ 'none' if 'worker' in group_names else 'static-threaded' }}
{% if 'main' in group_names %}
bind to = localhost {{ hostvars['rock64']['ansible_facts']['end0']['ipv4']['address'] }}:19998=streaming
{% endif %}
{% if 'worker' in group_names %}
[cloud]
proxy = http://{{ hostvars['rock64']['ansible_facts']['end0']['ipv4']['address'] }}:3128
{% endif %}

View File

@ -0,0 +1,265 @@
# netdata configuration for aggregating data from remote hosts
#
# API keys authorize a pair of sending-receiving netdata servers.
# Once their communication is authorized, they can exchange metrics for any
# number of hosts.
#
# You can generate API keys, with the linux command: uuidgen
# -----------------------------------------------------------------------------
# 1. ON CHILD NETDATA - THE ONE THAT WILL BE SENDING METRICS
[stream]
# Enable this on child nodes, to have them send metrics.
enabled = {{ 'no' if 'main' in hostvars[inventory_hostname].group_names else 'yes' }}
# Where is the receiving netdata?
# A space separated list of:
#
# [PROTOCOL:]HOST[%INTERFACE][:PORT][:SSL]
#
# If many are given, the first available will get the metrics.
#
# PROTOCOL = tcp, udp, or unix (only tcp and unix are supported by parent nodes)
# HOST = an IPv4, IPv6 IP, or a hostname, or a unix domain socket path.
# IPv6 IPs should be given with brackets [ip:address]
# INTERFACE = the network interface to use (only for IPv6)
# PORT = the port number or service name (/etc/services)
# SSL = when this word appears at the end of the destination string,
# Netdata will encrypt the connection with the parent.
#
# This communication is not HTTP (it cannot be proxied by web proxies).
destination = {{ '' if 'main' in hostvars[inventory_hostname].group_names else 'tcp:' ~ hostvars['rock64']['ansible_facts']['end0']['ipv4']['address'] ~ ':19998' }}
# Skip Certificate verification?
# The netdata child is configured to reject invalid SSL/TLS certificates,
# so certificates that are self-signed or expired will stop the streaming.
# If the server certificate is not valid, you can allow the use of
# 'bad' certificates by setting the next option to 'yes'.
#ssl skip certificate verification = yes
# Certificate Authority Path
# OpenSSL has a default directory where the known certificates are stored.
# In case it is necessary, it is possible to change this rule using the variable
# "CApath", e.g. CApath = /etc/ssl/certs/
#
#CApath =
# Certificate Authority file
# When the Netdata parent has a certificate that is not recognized as valid,
# we can add it to the list of known certificates in "CApath" and give it to
# Netdata as an argument, e.g. CAfile = /etc/ssl/certs/cert.pem
#
#CAfile =
# The API_KEY to use (as the sender)
api key = {{ '' if 'main' in hostvars[inventory_hostname].group_names else api_key }}
# Stream Compression
# The default is enabled
# You can control stream compression in this agent with options: yes | no
#enable compression = yes
# The timeout to connect and send metrics
timeout seconds = 60
# If the destination line above does not specify a port, use this
default port = 19999
# filter the charts to be streamed
# netdata SIMPLE PATTERN:
# - space separated list of patterns (use \ to include spaces in patterns)
# - use * as wildcard, any number of times within each pattern
# - prefix a pattern with ! for a negative match (ie not stream the charts it matches)
# - the order of patterns is important (left to right)
# To send all except a few, use: !this !that * (ie append a wildcard pattern)
send charts matching = *
# The buffer to use for sending metrics.
# 10MB is good for 60 seconds of data, so increase this if you expect latencies.
# The buffer is flushed on reconnects (this will not prevent gaps at the charts).
buffer size bytes = 10485760
# If the connection fails, or it disconnects,
# retry after that many seconds.
reconnect delay seconds = 5
# Sync the clock of the charts for that many iterations, when starting.
# It is ignored when replication is enabled
initial clock resync iterations = 60
# -----------------------------------------------------------------------------
# 2. ON PARENT NETDATA - THE ONE THAT WILL BE RECEIVING METRICS
# You can have one API key per child,
# or the same API key for all child nodes.
#
# netdata searches for options in this order:
#
# a) parent netdata settings (netdata.conf)
# b) [stream] section (above)
# c) [API_KEY] section (below, settings for the API key)
# d) [MACHINE_GUID] section (below, settings for each machine)
#
# You can combine the above (the more specific setting will be used).
# API key authentication
# If the key is not listed here, it will not be able to push metrics.
# [API_KEY] is [YOUR-API-KEY], e.g. [11111111-2222-3333-4444-555555555555]
{{ '[' ~ api_key ~ ']' if 'main' in hostvars[inventory_hostname].group_names else '[API_KEY]' }}
# Default settings for this API key
# This GUID is to be used as an API key from remote agents connecting
# to this machine. Failure to match such a key, denies access.
# YOU MUST SET THIS FIELD ON ALL API KEYS.
type = api
# You can disable the API key, by setting this to: no
# The default (for unknown API keys) is: no
enabled = {{ 'yes' if 'main' in hostvars[inventory_hostname].group_names else 'no' }}
# A list of simple patterns matching the IPs of the servers that
# will be pushing metrics using this API key.
# The metrics are received via the API port, so the same IPs
# should also be matched at netdata.conf [web].allow connections from
allow from = *
# The default history in entries, for all hosts using this API key.
# You can also set it per host below.
# For the default db mode (dbengine), this is ignored.
#default history = 3600
# The default memory mode to be used for all hosts using this API key.
# You can also set it per host below.
# If you don't set it here, the memory mode of netdata.conf will be used.
# Valid modes:
# save save on exit, load on start
# map like swap (continuously syncing to disks - you need SSD)
# ram keep it in RAM, don't touch the disk
# none no database at all (use this on headless proxies)
# dbengine like a traditional database
{{ 'default memory mode = dbengine' if 'main' in hostvars[inventory_hostname].group_names else '' }}
# Shall we enable health monitoring for the hosts using this API key?
# 3 possible values:
# yes enable alarms
# no do not enable alarms
# auto enable alarms, only when the sending netdata is connected.
# Health monitoring will be disabled as soon as the connection is closed.
# You can also set it per host, below.
# The default is taken from [health].enabled of netdata.conf
#health enabled by default = auto
# postpone alarms for a short period after the sender is connected
default postpone alarms on connect seconds = 60
# seconds of health log events to keep
#default health log history = 432000
# need to route metrics differently? set these.
# the defaults are the ones at the [stream] section (above)
#default proxy enabled = yes | no
#default proxy destination = IP:PORT IP:PORT ...
#default proxy api key = API_KEY
#default proxy send charts matching = *
# Stream Compression
# By default it is enabled.
# You can control stream compression in this parent agent stream with options: yes | no
#enable compression = yes
# select the order the compression algorithms will be used, when multiple are offered by the child
#compression algorithms order = zstd lz4 brotli gzip
# Replication
# Enable replication for all hosts using this api key. Default: enabled
#enable replication = yes
# How many seconds to replicate from each child. Default: a day
#seconds to replicate = 86400
# The duration we want to replicate per each step.
#replication_step = 600
# Indicate whether this child is an ephemeral node. An ephemeral node will become unavailable
# after the specified duration of "cleanup ephemeral hosts after secs" (as defined in the db section of netdata.conf)
# from the time of the node's last connection.
#is ephemeral node = false
# -----------------------------------------------------------------------------
# 3. PER SENDING HOST SETTINGS, ON PARENT NETDATA
# THIS IS OPTIONAL - YOU DON'T HAVE TO CONFIGURE IT
# This section exists to give you finer control of the parent settings for each
# child host, when the same API key is used by many netdata child nodes / proxies.
#
# Each netdata has a unique GUID - generated the first time netdata starts.
# You can find it at /var/lib/netdata/registry/netdata.public.unique.id
# (at the child).
#
# The host sending data will have one. If the host is not ephemeral,
# you can give settings for each sending host here.
[MACHINE_GUID]
# This GUID is to be used as a MACHINE GUID from remote agents connecting
# to this machine, not an API key.
# YOU MUST SET THIS FIELD ON ALL MACHINE GUIDs.
type = machine
# enable this host: yes | no
# When disabled, the parent will not receive metrics for this host.
# THIS IS NOT A SECURITY MECHANISM - AN ATTACKER CAN SET ANY OTHER GUID.
# Use only the API key for security.
enabled = no
# A list of simple patterns matching the IPs of the servers that
# will be pushing metrics using this MACHINE GUID.
# The metrics are received via the API port, so the same IPs
# should also be matched at netdata.conf [web].allow connections from
# and at stream.conf [API_KEY].allow from
allow from = *
# The number of entries in the database.
# This is ignored for db mode dbengine.
#history = 3600
# The memory mode of the database: save | map | ram | none | dbengine
#memory mode = dbengine
# Health / alarms control: yes | no | auto
#health enabled = auto
# postpone alarms when the sender connects
postpone alarms on connect seconds = 60
# seconds of health log events to keep
#health log history = 432000
# need to route metrics differently?
# the defaults are the ones at the [API KEY] section
#proxy enabled = yes | no
#proxy destination = IP:PORT IP:PORT ...
#proxy api key = API_KEY
#proxy send charts matching = *
# Stream Compression
# By default, enabled.
# You can control stream compression in this parent agent stream with options: yes | no
#enable compression = yes
# Replication
# Enable replication for all hosts using this api key.
#enable replication = yes
# How many seconds to replicate from each child.
#seconds to replicate = 86400
# The duration we want to replicate per each step.
#replication_step = 600
# Indicate whether this child is an ephemeral node. An ephemeral node will become unavailable
# after the specified duration of "cleanup ephemeral hosts after secs" (as defined in the db section of netdata.conf)
# from the time of the node's last connection.
#is ephemeral node = false

View File

@ -0,0 +1,27 @@
---
# Default variables for the Netdata role.

# Netdata Cloud claiming details.
# `claim_url` should be `https://app.netdata.cloud`. The matching
# `claim_token` and `claim_rooms` values come from Netdata Cloud: click your
# Space's name in the top navigation, choose `Manage your Space`, then open the
# `Nodes` tab, which displays a script containing the `token` and `room`
# strings. Read more:
# https://learn.netdata.cloud/docs/agent/claim
claim_url: "https://app.netdata.cloud"

# When true, nodes are re-claimed to Netdata Cloud even if already claimed.
# Read more:
# https://learn.netdata.cloud/docs/agent/claim#remove-and-reclaim-a-node
reclaim: false

# Metrics retention, expressed as the database engine's disk size in MiB.
# Read more:
# https://learn.netdata.cloud/docs/store/change-metrics-storage
dbengine_multihost_disk_space: 2048

# Whether to run the Agent web server/dashboard/API: `none` disables it
# (dashboards are viewed in Netdata Cloud instead), `static-threaded` keeps it
# running. Read more:
# https://learn.netdata.cloud/docs/configure/secure-nodes
# NOTE(review): the netdata.conf template in this commit derives `[web] mode`
# from group membership and does not appear to read this variable - confirm.
web_mode: "none"

View File

@ -0,0 +1,29 @@
claim_token: !vault |
$ANSIBLE_VAULT;1.1;AES256
62653264633831346161393763666666636535386239636231393831353130633138313666336435
6530306263613836356163376537393165633963376563390a316164373033373162646266613164
39646237323830626539386231313435393131363239376538383732646636303439616132353266
6634386563383837630a626266623337353932316666366538323835663136633930623636333131
66636537363731313232626666323264366464343261333633333233326165663434353136623334
35323261613866643139303432646537376132656237323462396237346166306666653531616462
61663864656130386562623136613166303462666237333230343132363864306165623631373034
63323666383362326431323539363633346464626163666435363236316439366338336339646636
62336633386438653834653361326462383234386466663335633064663638666461666365363461
30353735376566323861663431396164646665323563393363663637653134346130343336363631
336164663430653563353835336464346530
claim_rooms: !vault |
$ANSIBLE_VAULT;1.1;AES256
66396339663662373339323136386635306130656365343263666235653630636663336233383162
6335643763383037386636376362656565383365626435370a623266636136356334396335306135
31346162346662383033373031653766356436343037353534383939396163333739633964636463
6663343665303562330a313961626165333762646136356131333466643364373038353735346462
65366533353733333264383534653734663932643765393863623934316461383034666137653366
3033636130363731343337643763336536663437343865386131
api_key: !vault |
$ANSIBLE_VAULT;1.1;AES256
64626631376635366130646332386139646661386538653737383632303732653735613766613664
3133623739643763386537383537623837343762376265370a663837653135363732313231626664
33333765663039313866303665623663363062646432343539383434633631303239306664636537
3835643563663638360a636636393130656463353563343233373864356266363564663735373934
65326233656166386638616564373266393434623261653037353435373133663261353233353832
3234353835616133396565646439653363303133613932633065