|
| 1 | +node-exporter 的工作原理非常简单,简单到市面上找不到一本知名的书籍专门讲解它。但这并不意味着 node-exporter 不重要,相反,基本上有 Prometheus 的地方都会部署 node-exporter。 |
| 2 | + |
| 3 | +## Collectors |
| 4 | +通过提供 `--collector.<name>` 标志可以启用收集器。默认启用的收集器可以通过提供 `--no-collector.<name>` 标志来禁用。如需仅启用部分特定收集器,请使用 `--collector.disable-defaults --collector.<name> ...` 。 |
| 5 | + |
| 6 | +如果嫌指标太多,可以通过 `./node_exporter -h` 查看支持的 flags 参数,再用 `--no-collector.<name>` 禁用不需要的收集器。 |
| 7 | + |
| 8 | + |
| 9 | +### Include & Exclude flags |
| 10 | + |
| 11 | +A few collectors can be configured to include or exclude certain patterns using dedicated flags. The exclude flags are used to indicate "all except", while the include flags are used to say "none except". Note that these flags are mutually exclusive on collectors that support both. |
| 12 | + |
| 13 | + |
| 14 | +```txt |
| 15 | +--collector.filesystem.mount-points-exclude=^/(dev|proc|sys|var/lib/docker/.+|var/lib/kubelet/pods/.+)($|/) |
| 16 | +``` |
| 17 | + |
| 18 | +| Collector | Scope | Include Flag | Exclude Flag | |
| 19 | +| ---------- | ------------ | ------------------------------------------- | ------------------------------------------- | |
| 20 | +| arp | device | --collector.arp.device-include | --collector.arp.device-exclude | |
| 21 | +| cpu | bugs | --collector.cpu.info.bugs-include | N/A | |
| 22 | +| cpu | flags | --collector.cpu.info.flags-include | N/A | |
| 23 | +| diskstats | device | --collector.diskstats.device-include | --collector.diskstats.device-exclude | |
| 24 | +| ethtool | device | --collector.ethtool.device-include | --collector.ethtool.device-exclude | |
| 25 | +| ethtool | metrics | --collector.ethtool.metrics-include | N/A | |
| 26 | +| filesystem | fs-types | --collector.filesystem.fs-types-include | --collector.filesystem.fs-types-exclude | |
| 27 | +| filesystem | mount-points | --collector.filesystem.mount-points-include | --collector.filesystem.mount-points-exclude | |
| 28 | +| hwmon | chip | --collector.hwmon.chip-include | --collector.hwmon.chip-exclude | |
| 29 | +| hwmon | sensor | --collector.hwmon.sensor-include | --collector.hwmon.sensor-exclude | |
| 30 | +| interrupts | name | --collector.interrupts.name-include | --collector.interrupts.name-exclude | |
| 31 | +| netdev | device | --collector.netdev.device-include | --collector.netdev.device-exclude | |
| 32 | +| qdisc | device | --collector.qdisc.device-include | --collector.qdisc.device-exclude | |
| 33 | +| slabinfo | slab-names | --collector.slabinfo.slabs-include | --collector.slabinfo.slabs-exclude | |
| 34 | +| sysctl | all | --collector.sysctl.include | N/A | |
| 35 | +| systemd | unit | --collector.systemd.unit-include | --collector.systemd.unit-exclude | |
| 36 | +### Enabled by default |
| 37 | + |
| 38 | +Name | Description | OS |
| 39 | +---------|-------------|---- |
| 40 | +arp | Exposes ARP statistics from `/proc/net/arp`. | Linux |
| 41 | +bcache | Exposes bcache statistics from `/sys/fs/bcache/`. | Linux |
| 42 | +bonding | Exposes the number of configured and active slaves of Linux bonding interfaces. | Linux |
| 43 | +btrfs | Exposes btrfs statistics | Linux |
| 44 | +boottime | Exposes system boot time derived from the `kern.boottime` sysctl. | Darwin, Dragonfly, FreeBSD, NetBSD, OpenBSD, Solaris |
| 45 | +conntrack | Shows conntrack statistics (does nothing if no `/proc/sys/net/netfilter/` present). | Linux |
| 46 | +cpu | Exposes CPU statistics | Darwin, Dragonfly, FreeBSD, Linux, Solaris, OpenBSD |
| 47 | +cpufreq | Exposes CPU frequency statistics | Linux, Solaris |
| 48 | +diskstats | Exposes disk I/O statistics. | Darwin, Linux, OpenBSD |
| 49 | +dmi | Expose Desktop Management Interface (DMI) info from `/sys/class/dmi/id/` | Linux |
| 50 | +edac | Exposes error detection and correction statistics. | Linux |
| 51 | +entropy | Exposes available entropy. | Linux |
| 52 | +exec | Exposes execution statistics. | Dragonfly, FreeBSD |
| 53 | +fibrechannel | Exposes fibre channel information and statistics from `/sys/class/fc_host/`. | Linux |
| 54 | +filefd | Exposes file descriptor statistics from `/proc/sys/fs/file-nr`. | Linux |
| 55 | +filesystem | Exposes filesystem statistics, such as disk space used. | Darwin, Dragonfly, FreeBSD, Linux, OpenBSD |
| 56 | +hwmon | Expose hardware monitoring and sensor data from `/sys/class/hwmon/`. | Linux |
| 57 | +infiniband | Exposes network statistics specific to InfiniBand and Intel OmniPath configurations. | Linux |
| 58 | +ipvs | Exposes IPVS status from `/proc/net/ip_vs` and stats from `/proc/net/ip_vs_stats`. | Linux |
| 59 | +loadavg | Exposes load average. | Darwin, Dragonfly, FreeBSD, Linux, NetBSD, OpenBSD, Solaris |
| 60 | +mdadm | Exposes statistics about devices in `/proc/mdstat` (does nothing if no `/proc/mdstat` present). | Linux |
| 61 | +meminfo | Exposes memory statistics. | Darwin, Dragonfly, FreeBSD, Linux, OpenBSD |
| 62 | +netclass | Exposes network interface info from `/sys/class/net/` | Linux |
| 63 | +netdev | Exposes network interface statistics such as bytes transferred. | Darwin, Dragonfly, FreeBSD, Linux, OpenBSD |
| 64 | +netisr | Exposes netisr statistics | FreeBSD |
| 65 | +netstat | Exposes network statistics from `/proc/net/netstat`. This is the same information as `netstat -s`. | Linux |
| 66 | +nfs | Exposes NFS client statistics from `/proc/net/rpc/nfs`. This is the same information as `nfsstat -c`. | Linux |
| 67 | +nfsd | Exposes NFS kernel server statistics from `/proc/net/rpc/nfsd`. This is the same information as `nfsstat -s`. | Linux |
| 68 | +nvme | Exposes NVMe info from `/sys/class/nvme/` | Linux |
| 69 | +os | Expose OS release info from `/etc/os-release` or `/usr/lib/os-release` | _any_ |
| 70 | +powersupplyclass | Exposes Power Supply statistics from `/sys/class/power_supply` | Linux |
| 71 | +pressure | Exposes pressure stall statistics from `/proc/pressure/`. | Linux (kernel 4.20+ and/or [CONFIG\_PSI](https://www.kernel.org/doc/html/latest/accounting/psi.html)) |
| 72 | +rapl | Exposes various statistics from `/sys/class/powercap`. | Linux |
| 73 | +schedstat | Exposes task scheduler statistics from `/proc/schedstat`. | Linux |
| 74 | +selinux | Exposes SELinux statistics. | Linux |
| 75 | +sockstat | Exposes various statistics from `/proc/net/sockstat`. | Linux |
| 76 | +softnet | Exposes statistics from `/proc/net/softnet_stat`. | Linux |
| 77 | +stat | Exposes various statistics from `/proc/stat`. This includes boot time, forks and interrupts. | Linux |
| 78 | +tapestats | Exposes statistics from `/sys/class/scsi_tape`. | Linux |
| 79 | +textfile | Exposes statistics read from local disk. The `--collector.textfile.directory` flag must be set. | _any_ |
| 80 | +thermal | Exposes thermal statistics like `pmset -g therm`. | Darwin |
| 81 | +thermal\_zone | Exposes thermal zone & cooling device statistics from `/sys/class/thermal`. | Linux |
| 82 | +time | Exposes the current system time. | _any_ |
| 83 | +timex | Exposes selected adjtimex(2) system call stats. | Linux |
| 84 | +udp_queues | Exposes UDP total lengths of the rx_queue and tx_queue from `/proc/net/udp` and `/proc/net/udp6`. | Linux |
| 85 | +uname | Exposes system information as provided by the uname system call. | Darwin, FreeBSD, Linux, OpenBSD |
| 86 | +vmstat | Exposes statistics from `/proc/vmstat`. | Linux |
| 87 | +watchdog | Exposes statistics from `/sys/class/watchdog` | Linux |
| 88 | +xfs | Exposes XFS runtime statistics. | Linux (kernel 4.4+) |
| 89 | +zfs | Exposes [ZFS](http://open-zfs.org/) performance statistics. | FreeBSD, [Linux](http://zfsonlinux.org/), Solaris |
| 90 | + |
| 91 | +### Disabled by default |
| 92 | + |
| 93 | +`node_exporter` also implements a number of collectors that are disabled by default. Reasons for this vary by |
| 94 | +collector, and may include: |
| 95 | +* High cardinality |
| 96 | +* Prolonged runtime that exceeds the Prometheus `scrape_interval` or `scrape_timeout` |
| 97 | +* Significant resource demands on the host |
| 98 | + |
| 99 | +You can enable additional collectors as desired by adding them to your |
| 100 | +init system's or service supervisor's startup configuration for |
| 101 | +`node_exporter` but caution is advised. Enable at most one at a time, |
| 102 | +testing first on a non-production system, then by hand on a single |
| 103 | +production node. When enabling additional collectors, you should |
| 104 | +carefully monitor the change by observing the |
| 105 | +`scrape_duration_seconds` metric to ensure that collection completes |
| 106 | +and does not time out. In addition, monitor the |
| 107 | +`scrape_samples_post_metric_relabeling` metric to see the changes in |
| 108 | +cardinality. |
| 109 | + |
| 110 | +| Name | Description | OS | |
| 111 | +| -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------ | |
| 112 | +| buddyinfo | Exposes statistics of memory fragments as reported by /proc/buddyinfo. | Linux | |
| 113 | +| cgroups | A summary of the number of active and enabled cgroups | Linux | |
| 114 | +| cpu\_vulnerabilities | Exposes CPU vulnerability information from sysfs. | Linux | |
| 115 | +| devstat | Exposes device statistics | Dragonfly, FreeBSD | |
| 116 | +| drm | Expose GPU metrics using sysfs / DRM, `amdgpu` is the only driver which exposes this information through DRM | Linux | |
| 117 | +| drbd | Exposes Distributed Replicated Block Device statistics (to version 8.4) | Linux | |
| 118 | +| ethtool | Exposes network interface information and network driver statistics equivalent to `ethtool`, `ethtool -S`, and `ethtool -i`. | Linux | |
| 119 | +| interrupts | Exposes detailed interrupts statistics. | Linux, OpenBSD | |
| 120 | +| ksmd | Exposes kernel and system statistics from `/sys/kernel/mm/ksm`. | Linux | |
| 121 | +| lnstat | Exposes stats from `/proc/net/stat/`. | Linux | |
| 122 | +| logind | Exposes session counts from [logind](http://www.freedesktop.org/wiki/Software/systemd/logind/). | Linux | |
| 123 | +| meminfo\_numa | Exposes memory statistics from `/sys/devices/system/node/node[0-9]*/meminfo`, `/sys/devices/system/node/node[0-9]*/numastat`. | Linux | |
| 124 | +| mountstats | Exposes filesystem statistics from `/proc/self/mountstats`. Exposes detailed NFS client statistics. | Linux | |
| 125 | +| network_route | Exposes the routing table as metrics | Linux | |
| 126 | +| perf | Exposes perf based metrics (Warning: Metrics are dependent on kernel configuration and settings). | Linux | |
| 127 | +| processes | Exposes aggregate process statistics from `/proc`. | Linux | |
| 128 | +| qdisc | Exposes [queuing discipline](https://en.wikipedia.org/wiki/Network_scheduler#Linux_kernel) statistics | Linux | |
| 129 | +| slabinfo | Exposes slab statistics from `/proc/slabinfo`. Note that permission of `/proc/slabinfo` is usually 0400, so set it appropriately. | Linux | |
| 130 | +| softirqs | Exposes detailed softirq statistics from `/proc/softirqs`. | Linux | |
| 131 | +| sysctl | Expose sysctl values from `/proc/sys`. Use `--collector.sysctl.include(-info)` to configure. | Linux | |
| 132 | +| systemd | Exposes service and system status from [systemd](http://www.freedesktop.org/wiki/Software/systemd/). | Linux | |
| 133 | +| tcpstat | Exposes TCP connection status information from `/proc/net/tcp` and `/proc/net/tcp6`. (Warning: the current version has potential performance issues in high load situations.) | Linux | |
| 134 | +| wifi | Exposes WiFi device and station statistics. | Linux | |
| 135 | +| xfrm | Exposes statistics from `/proc/net/xfrm_stat` | Linux | |
| 136 | +| zoneinfo | Exposes NUMA memory zone metrics. | Linux | |
| 137 | + |
| 138 | +## 配置 |
| 139 | +kube-prometheus中node-exporter的配置如下 |
| 140 | +```yaml |
| 141 | +apiVersion: apps/v1 |
| 142 | +# 使用 DaemonSet 类型,确保每个 node 上只有一个 pod 运行 |
| 143 | +kind: DaemonSet |
| 144 | +metadata: |
| 145 | + labels: |
| 146 | + app.kubernetes.io/component: exporter |
| 147 | + app.kubernetes.io/name: node-exporter |
| 148 | + app.kubernetes.io/part-of: kube-prometheus |
| 149 | + app.kubernetes.io/version: 1.1.2 |
| 150 | + name: node-exporter |
| 151 | + namespace: monitor |
| 152 | +spec: |
| 153 | + selector: |
| 154 | + matchLabels: |
| 155 | + app.kubernetes.io/component: exporter |
| 156 | + app.kubernetes.io/name: node-exporter |
| 157 | + app.kubernetes.io/part-of: kube-prometheus |
| 158 | + template: |
| 159 | + metadata: |
| 160 | + labels: |
| 161 | + app.kubernetes.io/component: exporter |
| 162 | + app.kubernetes.io/name: node-exporter |
| 163 | + app.kubernetes.io/part-of: kube-prometheus |
| 164 | + app.kubernetes.io/version: 1.1.2 |
| 165 | + spec: |
| 166 | + # 使用高优先级的 PriorityClass:在资源不足时,低优先级的 Pod 可能会被驱逐(evicted),以腾出资源给高优先级 Pod。 |
| 167 | + priorityClassName: system-cluster-critical |
| 168 | + containers: |
| 169 | + - args: |
| 170 | + # node-exporter 只监听 127.0.0.1;eth0 的 IP 留给代理(kube-rbac-proxy)监听,这样代理和 node-exporter 可以共用同一个端口号 9100 |
| 171 | + - --web.listen-address=127.0.0.1:9100 |
| 172 | + # 在容器环境中,会将 /sys 挂载到 /host/sys |
| 173 | + - --path.sysfs=/host/sys |
| 174 | + # 在容器中将 / 挂载到目录 /host/root |
| 175 | + - --path.rootfs=/host/root |
| 176 | + # wifi相关数据不采集 |
| 177 | + - --no-collector.wifi |
| 178 | + # 禁用硬件数据(hwmon)采集:容器环境中没有特权,很多硬件数据在容器内部无法采集 |
| 179 | + - --no-collector.hwmon |
| 180 | + # 部分文件挂载点不被采集 |
| 181 | + - --collector.filesystem.ignored-mount-points=^/(dev|proc|sys|var/lib/docker/.+|var/lib/kubelet/pods/.+)($|/) |
| 182 | + # veth 开头的网卡不采集数据,这些网卡是虚拟网卡 |
| 183 | + - --collector.netclass.ignored-devices=^(veth.*)$ |
| 184 | + # netdev 收集器同样排除 veth 开头的网卡,不输出这些设备对应的指标 |
| 185 | + - --collector.netdev.device-exclude=^(veth.*)$ |
| 186 | + image: HARBOR_ADDR/quay.io/prometheus/node-exporter:v1.1.2 |
| 187 | + name: node-exporter |
| 188 | + resources: |
| 189 | + limits: |
| 190 | + cpu: 250m |
| 191 | + memory: 180Mi |
| 192 | + requests: |
| 193 | + cpu: 102m |
| 194 | + memory: 180Mi |
| 195 | + volumeMounts: |
| 196 | + - mountPath: /host/sys |
| 197 | + mountPropagation: HostToContainer |
| 198 | + name: sys |
| 199 | + readOnly: true |
| 200 | + - mountPath: /host/root |
| 201 | + mountPropagation: HostToContainer |
| 202 | + name: root |
| 203 | + readOnly: true |
| 204 | + - args: |
| 205 | + - --logtostderr |
| 206 | + # k8s 实现的环境变量注入:容器启动时,$(IP) 会被替换为下方 env 中定义的环境变量 IP,其值通过 fieldRef 取自 status.podIP |
| 207 | + - --secure-listen-address=[$(IP)]:9100 |
| 208 | + - --tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305 |
| 209 | + # 代理转发请求的上游地址,即监听在 127.0.0.1:9100 上的 node-exporter |
| 210 | + - --upstream=http://127.0.0.1:9100/ |
| 211 | + env: |
| 212 | + - name: IP |
| 213 | + valueFrom: |
| 214 | + fieldRef: |
| 215 | + fieldPath: status.podIP |
| 216 | + image: HARBOR_ADDR/quay.io/brancz/kube-rbac-proxy:v0.8.0 |
| 217 | + name: kube-rbac-proxy |
| 218 | + ports: |
| 219 | + - containerPort: 9100 |
| 220 | + hostPort: 9100 |
| 221 | + name: https |
| 222 | + resources: |
| 223 | + limits: |
| 224 | + cpu: 20m |
| 225 | + memory: 40Mi |
| 226 | + requests: |
| 227 | + cpu: 10m |
| 228 | + memory: 20Mi |
| 229 | + securityContext: |
| 230 | + runAsGroup: 65532 |
| 231 | + runAsNonRoot: true |
| 232 | + runAsUser: 65532 |
| 233 | + hostNetwork: true |
| 234 | + hostPID: true |
| 235 | + nodeSelector: |
| 236 | + kubernetes.io/os: linux |
| 237 | + securityContext: |
| 238 | + runAsNonRoot: true |
| 239 | + runAsUser: 65534 |
| 240 | + serviceAccountName: node-exporter |
| 241 | + tolerations: |
| 242 | + - operator: Exists |
| 243 | + volumes: |
| 244 | + - hostPath: |
| 245 | + path: /sys |
| 246 | + name: sys |
| 247 | + - hostPath: |
| 248 | + path: / |
| 249 | + name: root |
| 250 | + updateStrategy: |
| 251 | + rollingUpdate: |
| 252 | + maxUnavailable: 10% |
| 253 | + type: RollingUpdate |
| 254 | +``` |
| 255 | +
|
| 256 | +
|
0 commit comments