commit 009f00a422f932da586ae28a24fa7e0b0ecc2943 Author: Davide Oddone Date: Sun Dec 31 19:42:33 2023 +0100 First somewhat working prototype - only monitoring diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..03fba6d --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +print_facts/ +roles/common +debug.yml diff --git a/cluster.yml b/cluster.yml new file mode 100644 index 0000000..01e05b9 --- /dev/null +++ b/cluster.yml @@ -0,0 +1,4 @@ +--- +- hosts: all + roles: + - monitoring diff --git a/host_vars/rock64.yml b/host_vars/rock64.yml new file mode 100644 index 0000000..58dfefe --- /dev/null +++ b/host_vars/rock64.yml @@ -0,0 +1,7 @@ +$ANSIBLE_VAULT;1.1;AES256 +31363536343633356337316532313364373738373938386537623030353663356636643332306565 +6264633065623966323638366334316333373334363935300a666132353262326532616437653266 +33396338326266373662646133333539356366396133316262326266363962366236383639346366 +3336363035383931310a656138313033303662363935313039303837653233323265343832383935 +64633635623431623561633665616436616231306264353465353637343039363432636432333634 +3361633938323730333337353264363761326633383864303464 diff --git a/host_vars/rockpro64.yml b/host_vars/rockpro64.yml new file mode 100644 index 0000000..75a3c93 --- /dev/null +++ b/host_vars/rockpro64.yml @@ -0,0 +1,7 @@ +$ANSIBLE_VAULT;1.1;AES256 +35336163336431326538313432323733383261653562323139363036663263653939633437323232 +3134393539633036383563643563656238626164376337660a613630306164396133633831306630 +34323831323631633064363634616530353730396238383031646333366463653231393638643462 +3662633431306238370a373663353966313162373937333238653838393739376334616135336133 +61393166373136343839383363613439633062646138656636643161366533636330393633343333 +6634326334303933306233613833616232376462306437663165 diff --git a/hosts b/hosts new file mode 100644 index 0000000..8c92daf --- /dev/null +++ b/hosts @@ -0,0 +1,5 @@ +[main] +rock64 ansible_connection=local ansible_host=localhost 
ansible_user=doddo hostname=rock64_1
+
+[worker]
+rockpro64 ansible_host=192.168.0.2 ansible_user=doddo hostname=rockpro64_4G
diff --git a/roles/monitoring/README.md b/roles/monitoring/README.md
new file mode 100644
index 0000000..23179a0
--- /dev/null
+++ b/roles/monitoring/README.md
@@ -0,0 +1 @@
+See https://github.com/netdata/netdata/tree/master/packaging/installer/methods/ansible.md
diff --git a/roles/monitoring/handlers/main.yml b/roles/monitoring/handlers/main.yml
new file mode 100644
index 0000000..423d34e
--- /dev/null
+++ b/roles/monitoring/handlers/main.yml
@@ -0,0 +1,6 @@
+# Restart Netdata
+- name: Restart Netdata
+  become: true
+  service:
+    name: netdata
+    state: restarted
diff --git a/roles/monitoring/tasks/claim.yml b/roles/monitoring/tasks/claim.yml
new file mode 100644
index 0000000..19f1d25
--- /dev/null
+++ b/roles/monitoring/tasks/claim.yml
@@ -0,0 +1,45 @@
+---
+# Claim this node to Netdata Cloud, or force a re-claim when `reclaim` is set.
+# `| bool` keeps the switch working when `reclaim` arrives as a string,
+# e.g. from `-e reclaim=true` on the command line.
+- name: Claim to Netdata Cloud
+  when: not (reclaim | bool)
+  block:
+    - name: Claim to Netdata Cloud if not already
+      become: true
+      # Keep the claim token and room IDs out of task output and logs.
+      no_log: true
+      # `argv` bypasses the shell, so the values reach the script verbatim.
+      command:
+        argv:
+          - netdata-claim.sh
+          - "-token={{ claim_token }}"
+          - "-rooms={{ claim_rooms }}"
+          - "-url={{ claim_url }}"
+        creates: /var/lib/netdata/cloud.d/claimed_id
+
+- name: Re-claim a node to Netdata Cloud
+  when: reclaim | bool
+  block:
+    - name: Ensure `uuidgen` is installed
+      stat:
+        path: /usr/bin/uuidgen
+      register: uuidgen_result
+
+    - name: Fail if `uuidgen` is not installed
+      fail:
+        msg: The system needs `uuidgen` installed to enable re-claiming.
+      when: not uuidgen_result.stat.exists
+
+    - name: Reclaim the node with `-id=`
+      become: true
+      no_log: true
+      # The shell is needed for `$(uuidgen)`; templated values are quoted for it.
+      shell: >-
+        netdata-claim.sh
+        -token={{ claim_token | quote }}
+        -rooms={{ claim_rooms | quote }}
+        -url={{ claim_url | quote }}
+        -id=$(uuidgen)
+      when: uuidgen_result.stat.exists
+      notify: Restart Netdata
diff --git a/roles/monitoring/tasks/configure.yml b/roles/monitoring/tasks/configure.yml
new file mode 100644
index 0000000..90070b0
--- /dev/null
+++ b/roles/monitoring/tasks/configure.yml
@@ -0,0 +1,25 @@
+---
+# Render Netdata's configuration files from the role templates and restart
+# the agent whenever either file changes.
+- name: Render netdata.conf
+  become: true
+  template:
+    src: netdata.conf.j2
+    dest: /etc/netdata/netdata.conf
+    owner: root
+    group: root
+    # Config files are data: root-writable, world-readable, never executable.
+    mode: "0644"
+  notify: Restart Netdata
+
+- name: Render stream.conf
+  become: true
+  template:
+    src: stream.conf.j2
+    dest: /etc/netdata/stream.conf
+    owner: root
+    group: root
+    # NOTE(review): this file carries the streaming API key; consider a
+    # tighter mode once the group the agent runs under is confirmed.
+    mode: "0644"
+  notify: Restart Netdata
diff --git a/roles/monitoring/tasks/install.yml b/roles/monitoring/tasks/install.yml
new file mode 100644
index 0000000..a37f7ca
--- /dev/null
+++ b/roles/monitoring/tasks/install.yml
@@ -0,0 +1,14 @@
+---
+- name: Download the installation script
+  get_url:
+    url: https://my-netdata.io/kickstart.sh
+    dest: ~/kickstart.sh
+    mode: +x
+
+- name: Install Netdata
+  command: ~/kickstart.sh --dont-wait
+
+- name: Cleanup installation script
+  file:
+    path: ~/kickstart.sh
+    state: absent
diff --git a/roles/monitoring/tasks/main.yml b/roles/monitoring/tasks/main.yml
new file mode 100644
index 0000000..54685e8
--- /dev/null
+++ b/roles/monitoring/tasks/main.yml
@@ -0,0 +1,16 @@
+---
+# Tasks file for Netdata
+- name: Install Netdata
+  become: true
+  become_method: sudo
+  import_tasks: install.yml
+
+- name: Configure Netdata
+  become: true
+  become_method: sudo
+  import_tasks: configure.yml
+
+- name: Claim the node to Netdata Cloud
+  become: true
+  become_method: sudo
+  import_tasks: claim.yml
diff --git a/roles/monitoring/templates/netdata.conf.j2 b/roles/monitoring/templates/netdata.conf.j2
new file mode 100644
index 0000000..a508402
--- /dev/null
+++
b/roles/monitoring/templates/netdata.conf.j2
@@ -0,0 +1,27 @@
+{#
+  Rendered to /etc/netdata/netdata.conf by the role's configure tasks.
+  `worker` hosts run without a local database or web server and reach
+  Netdata Cloud through a proxy on rock64; `main` hosts also listen on
+  port 19998 for streaming.
+#}
+# Netdata configuration
+
+[global]
+{% if hostvars[inventory_hostname].hostname | default('') %}
+    hostname = {{ hostvars[inventory_hostname].hostname }}
+{% endif %}
+    dbengine multihost disk space = {{ dbengine_multihost_disk_space }}
+{% if 'worker' in group_names %}
+    memory mode = none
+{% endif %}
+
+[web]
+    mode = {{ 'none' if 'worker' in group_names else 'static-threaded' }}
+{% if 'main' in group_names %}
+    bind to = localhost {{ hostvars['rock64']['ansible_facts']['end0']['ipv4']['address'] }}:19998=streaming
+{% endif %}
+
+{% if 'worker' in group_names %}
+[cloud]
+    proxy = http://{{ hostvars['rock64']['ansible_facts']['end0']['ipv4']['address'] }}:3128
+{% endif %}
diff --git a/roles/monitoring/templates/stream.conf.j2 b/roles/monitoring/templates/stream.conf.j2
new file mode 100644
index 0000000..3841fcd
--- /dev/null
+++ b/roles/monitoring/templates/stream.conf.j2
@@ -0,0 +1,265 @@
+# netdata configuration for aggregating data from remote hosts
+#
+# API keys authorize a pair of sending-receiving netdata servers.
+# Once their communication is authorized, they can exchange metrics for any
+# number of hosts.
+#
+# You can generate API keys, with the linux command: uuidgen
+
+
+# -----------------------------------------------------------------------------
+# 1. ON CHILD NETDATA - THE ONE THAT WILL BE SENDING METRICS
+
+[stream]
+    # Enable this on child nodes, to have them send metrics.
+    enabled = {{ 'no' if 'main' in hostvars[inventory_hostname].group_names else 'yes' }}
+
+    # Where is the receiving netdata?
+    # A space separated list of:
+    #
+    #      [PROTOCOL:]HOST[%INTERFACE][:PORT][:SSL]
+    #
+    #      If many are given, the first available will get the metrics.
+ # + # PROTOCOL = tcp, udp, or unix (only tcp and unix are supported by parent nodes) + # HOST = an IPv4, IPv6 IP, or a hostname, or a unix domain socket path. + # IPv6 IPs should be given with brackets [ip:address] + # INTERFACE = the network interface to use (only for IPv6) + # PORT = the port number or service name (/etc/services) + # SSL = when this word appears at the end of the destination string + # Netdata will encrypt the connection with the parent. + # + # This communication is not HTTP (it cannot be proxied by web proxies). + destination = {{ '' if 'main' in hostvars[inventory_hostname].group_names else 'tcp:' ~ hostvars['rock64']['ansible_facts']['end0']['ipv4']['address'] ~ ':19998' }} + + # Skip Certificate verification? + # The netdata child is configured to reject invalid SSL/TLS certificates, + # so certificates that are self-signed or expired will stop the streaming. + # In case the server certificate is not valid, you can enable the use of + # 'bad' certificates by setting the next option to 'yes'. + #ssl skip certificate verification = yes + + # Certificate Authority Path + # OpenSSL has a default directory where the known certificates are stored. + # In case it is necessary, it is possible to change this rule using the variable + # "CApath", e.g. CApath = /etc/ssl/certs/ + # + #CApath = + + # Certificate Authority file + # When the Netdata parent has a certificate that is not recognized as valid, + # we can add it to the list of known certificates in "CApath" and give it to + # Netdata as an argument, e.g.
CAfile = /etc/ssl/certs/cert.pem + # + #CAfile = + + # The API_KEY to use (as the sender) + api key = {{ '' if 'main' in hostvars[inventory_hostname].group_names else api_key }} + + # Stream Compression + # The default is enabled + # You can control stream compression in this agent with options: yes | no + #enable compression = yes + + # The timeout to connect and send metrics + timeout seconds = 60 + + # If the destination line above does not specify a port, use this + default port = 19999 + + # filter the charts to be streamed + # netdata SIMPLE PATTERN: + # - space separated list of patterns (use \ to include spaces in patterns) + # - use * as wildcard, any number of times within each pattern + # - prefix a pattern with ! for a negative match (ie not stream the charts it matches) + # - the order of patterns is important (left to right) + # To send all except a few, use: !this !that * (ie append a wildcard pattern) + send charts matching = * + + # The buffer to use for sending metrics. + # 10MB is good for 60 seconds of data, so increase this if you expect latencies. + # The buffer is flushed on reconnects (this will not prevent gaps at the charts). + buffer size bytes = 10485760 + + # If the connection fails, or it disconnects, + # retry after that many seconds. + reconnect delay seconds = 5 + + # Sync the clock of the charts for that many iterations, when starting. + # It is ignored when replication is enabled + initial clock resync iterations = 60 + +# ----------------------------------------------------------------------------- +# 2. ON PARENT NETDATA - THE ONE THAT WILL BE RECEIVING METRICS + +# You can have one API key per child, +# or the same API key for all child nodes. 
+# +# netdata searches for options in this order: +# +# a) parent netdata settings (netdata.conf) +# b) [stream] section (above) +# c) [API_KEY] section (below, settings for the API key) +# d) [MACHINE_GUID] section (below, settings for each machine) +# +# You can combine the above (the more specific setting will be used). + +# API key authentication +# If the key is not listed here, it will not be able to push metrics. + +# [API_KEY] is [YOUR-API-KEY], i.e [11111111-2222-3333-4444-555555555555] +{{ '[' ~ api_key ~ ']' if 'main' in hostvars[inventory_hostname].group_names else '[API_KEY]' }} + # Default settings for this API key + + # This GUID is to be used as an API key from remote agents connecting + # to this machine. Failure to match such a key, denies access. + # YOU MUST SET THIS FIELD ON ALL API KEYS. + type = api + + # You can disable the API key, by setting this to: no + # The default (for unknown API keys) is: no + enabled = {{ 'yes' if 'main' in hostvars[inventory_hostname].group_names else 'no' }} + + # A list of simple patterns matching the IPs of the servers that + # will be pushing metrics using this API key. + # The metrics are received via the API port, so the same IPs + # should also be matched at netdata.conf [web].allow connections from + allow from = * + + # The default history in entries, for all hosts using this API key. + # You can also set it per host below. + # For the default db mode (dbengine), this is ignored. + #default history = 3600 + + # The default memory mode to be used for all hosts using this API key. + # You can also set it per host below. + # If you don't set it here, the memory mode of netdata.conf will be used. 
+ # Valid modes: + # save save on exit, load on start + # map like swap (continuously syncing to disks - you need SSD) + # ram keep it in RAM, don't touch the disk + # none no database at all (use this on headless proxies) + # dbengine like a traditional database + {{ 'default memory mode = dbengine' if 'main' in hostvars[inventory_hostname].group_names else '' }} + + # Shall we enable health monitoring for the hosts using this API key? + # 3 possible values: + # yes enable alarms + # no do not enable alarms + # auto enable alarms, only when the sending netdata is connected. + # Health monitoring will be disabled as soon as the connection is closed. + # You can also set it per host, below. + # The default is taken from [health].enabled of netdata.conf + #health enabled by default = auto + + # postpone alarms for a short period after the sender is connected + default postpone alarms on connect seconds = 60 + + # seconds of health log events to keep + #default health log history = 432000 + + # need to route metrics differently? set these. + # the defaults are the ones at the [stream] section (above) + #default proxy enabled = yes | no + #default proxy destination = IP:PORT IP:PORT ... + #default proxy api key = API_KEY + #default proxy send charts matching = * + + # Stream Compression + # By default it is enabled. + # You can control stream compression in this parent agent stream with options: yes | no + #enable compression = yes + + # select the order the compression algorithms will be used, when multiple are offered by the child + #compression algorithms order = zstd lz4 brotli gzip + + # Replication + # Enable replication for all hosts using this api key. Default: enabled + #enable replication = yes + + # How many seconds to replicate from each child. Default: a day + #seconds to replicate = 86400 + + # The duration we want to replicate per each step. + #replication_step = 600 + + # Indicate whether this child is an ephemeral node. 
An ephemeral node will become unavailable + # after the specified duration of "cleanup ephemeral hosts after secs" (as defined in the db section of netdata.conf) + # from the time of the node's last connection. + #is ephemeral node = false + +# ----------------------------------------------------------------------------- +# 3. PER SENDING HOST SETTINGS, ON PARENT NETDATA +# THIS IS OPTIONAL - YOU DON'T HAVE TO CONFIGURE IT + +# This section exists to give you finer control of the parent settings for each +# child host, when the same API key is used by many netdata child nodes / proxies. +# +# Each netdata has a unique GUID - generated the first time netdata starts. +# You can find it at /var/lib/netdata/registry/netdata.public.unique.id +# (at the child). +# +# The host sending data will have one. If the host is not ephemeral, +# you can give settings for each sending host here. + +[MACHINE_GUID] + # This GUID is to be used as a MACHINE GUID from remote agents connecting + # to this machine, not an API key. + # YOU MUST SET THIS FIELD ON ALL MACHINE GUIDs. + type = machine + + # enable this host: yes | no + # When disabled, the parent will not receive metrics for this host. + # THIS IS NOT A SECURITY MECHANISM - AN ATTACKER CAN SET ANY OTHER GUID. + # Use only the API key for security. + enabled = no + + # A list of simple patterns matching the IPs of the servers that + # will be pushing metrics using this MACHINE GUID. + # The metrics are received via the API port, so the same IPs + # should also be matched at netdata.conf [web].allow connections from + # and at stream.conf [API_KEY].allow from + allow from = * + + # The number of entries in the database. + # This is ignored for db mode dbengine. 
+ #history = 3600 + + # The memory mode of the database: save | map | ram | none | dbengine + #memory mode = dbengine + + # Health / alarms control: yes | no | auto + #health enabled = auto + + # postpone alarms when the sender connects + postpone alarms on connect seconds = 60 + + # seconds of health log events to keep + #health log history = 432000 + + # need to route metrics differently? + # the defaults are the ones at the [API KEY] section + #proxy enabled = yes | no + #proxy destination = IP:PORT IP:PORT ... + #proxy api key = API_KEY + #proxy send charts matching = * + + # Stream Compression + # By default, enabled. + # You can control stream compression in this parent agent stream with options: yes | no + #enable compression = yes + + # Replication + # Enable replication for all hosts using this api key. + #enable replication = yes + + # How many seconds to replicate from each child. + #seconds to replicate = 86400 + + # The duration we want to replicate per each step. + #replication_step = 600 + + # Indicate whether this child is an ephemeral node. An ephemeral node will become unavailable + # after the specified duration of "cleanup ephemeral hosts after secs" (as defined in the db section of netdata.conf) + # from the time of the node's last connection. + #is ephemeral node = false diff --git a/roles/monitoring/vars/main/main.yml b/roles/monitoring/vars/main/main.yml new file mode 100644 index 0000000..ec30826 --- /dev/null +++ b/roles/monitoring/vars/main/main.yml @@ -0,0 +1,27 @@ +--- +# Variables for Netdata + +# Set Netdata Cloud claiming details. To find your `claim_token` and +# `claim_room`, go to Netdata Cloud, then click on your Space's name in the top +# navigation, then click on `Manage your Space`. Click on the `Nodes` tab in the +# panel that appears, which displays a script with `token` and `room` strings. +# Copy those strings into the variables below. `claim_url` should be +# `https://app.netdata.cloud`. 
Read more: +# https://learn.netdata.cloud/docs/agent/claim +claim_url: https://app.netdata.cloud + +# Force re-claiming of nodes to Netdata Cloud. Read more: +# https://learn.netdata.cloud/docs/agent/claim#remove-and-reclaim-a-node +reclaim: false + +# Set Netdata's metrics retention policy via the disk size for the database +# engine. Value is in MiB. Read more: +# https://learn.netdata.cloud/docs/store/change-metrics-storage +dbengine_multihost_disk_space: 2048 + +# Set whether to run the Agent web server/dashboard/API, or disable them. +# Because we're connecting this node to Netdata Cloud and will view dashboards +# there, we'll set this to `none` to disable the local dashboard. Set to +# `static-threaded` if you want to keep it running. Read more: +# https://learn.netdata.cloud/docs/configure/secure-nodes +web_mode: none diff --git a/roles/monitoring/vars/main/vault.yml b/roles/monitoring/vars/main/vault.yml new file mode 100644 index 0000000..4e9b5bf --- /dev/null +++ b/roles/monitoring/vars/main/vault.yml @@ -0,0 +1,29 @@ +claim_token: !vault | + $ANSIBLE_VAULT;1.1;AES256 + 62653264633831346161393763666666636535386239636231393831353130633138313666336435 + 6530306263613836356163376537393165633963376563390a316164373033373162646266613164 + 39646237323830626539386231313435393131363239376538383732646636303439616132353266 + 6634386563383837630a626266623337353932316666366538323835663136633930623636333131 + 66636537363731313232626666323264366464343261333633333233326165663434353136623334 + 35323261613866643139303432646537376132656237323462396237346166306666653531616462 + 61663864656130386562623136613166303462666237333230343132363864306165623631373034 + 63323666383362326431323539363633346464626163666435363236316439366338336339646636 + 62336633386438653834653361326462383234386466663335633064663638666461666365363461 + 30353735376566323861663431396164646665323563393363663637653134346130343336363631 + 336164663430653563353835336464346530 +claim_rooms: !vault | + 
$ANSIBLE_VAULT;1.1;AES256 + 66396339663662373339323136386635306130656365343263666235653630636663336233383162 + 6335643763383037386636376362656565383365626435370a623266636136356334396335306135 + 31346162346662383033373031653766356436343037353534383939396163333739633964636463 + 6663343665303562330a313961626165333762646136356131333466643364373038353735346462 + 65366533353733333264383534653734663932643765393863623934316461383034666137653366 + 3033636130363731343337643763336536663437343865386131 +api_key: !vault | + $ANSIBLE_VAULT;1.1;AES256 + 64626631376635366130646332386139646661386538653737383632303732653735613766613664 + 3133623739643763386537383537623837343762376265370a663837653135363732313231626664 + 33333765663039313866303665623663363062646432343539383434633631303239306664636537 + 3835643563663638360a636636393130656463353563343233373864356266363564663735373934 + 65326233656166386638616564373266393434623261653037353435373133663261353233353832 + 3234353835616133396565646439653363303133613932633065