influxdb.yaml
# Persistent storage configuration
persistence:
  enabled: true
  useExisting: true
  name: "influxdb-pvc" # the PVC we just created
  accessMode: "ReadWriteOnce"
  size: 200Gi
# Create the database for Prometheus
env:
  - name: INFLUXDB_DB
    value: "prometheus"
# InfluxDB configuration
config:
  data:
    # These two settings cap the amount of data by default. Setting them to 0
    # removes the limits; otherwise writes start failing once a limit is reached.
    max_series_per_database: 0
    max_values_per_tag: 0
  http:
    enabled: true # enable HTTP
initScripts:
  enabled: true
  scripts:
    # Set the data retention policy. The default keeps data forever,
    # which then has to be cleaned up manually.
    # Keep 180 days of data.
    retention.iql: |+
      CREATE RETENTION POLICY "default_retention_policy" on "prometheus" DURATION 180d REPLICATION 1 DEFAULT
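Init scripts like this are normally only applied when the data directory is first initialized. If you later want a different retention window on a running instance, the policy can be changed with an ALTER statement from the influx shell; a minimal sketch, reusing the policy and database names from the script above (the 90d value is just an example):

influx -execute 'ALTER RETENTION POLICY "default_retention_policy" ON "prometheus" DURATION 90d REPLICATION 1 DEFAULT'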
The full set of InfluxDB configuration options is covered in the documentation; here I will go over the two main settings used above.
max-series-per-database
The maximum number of series per database held in memory. The default is 1000000; setting it to 0 removes the limit. If incoming data adds new series and pushes the count past this limit, the points are dropped and a 500 error is returned:
{"error":"max series per database exceeded: <series>"}
max-values-per-tag
The maximum number of values per tag key held in memory. The default is 100000; setting it to 0 removes the limit. Data that exceeds this limit is likewise dropped and the write fails with an error.
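If you would rather keep the limits in place, it helps to know how close the database is to them. A rough check, assuming the influx CLI is available inside the InfluxDB pod (the pod name is a placeholder):

# Total series cardinality in the prometheus database
kubectl exec -n monitoring <influxdb-pod> -- influx -database prometheus -execute 'SHOW SERIES CARDINALITY'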
Deploy InfluxDB with the following command:
helm install --name=influxdb --namespace=monitoring -f influxdb.yaml stable/influxdb
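Before moving on, it is worth confirming that the release is healthy and that the init script created the retention policy. A minimal check, with the pod name as a placeholder:

# The InfluxDB pod should reach the Running state
kubectl get pods -n monitoring
# The policy from retention.iql should be listed and marked as the default
kubectl exec -n monitoring <influxdb-pod> -- influx -execute 'SHOW RETENTION POLICIES ON prometheus'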
Once the storage backend is up, we can move on to deploying prometheus-operator. First create the following configuration file:
prometheus.yaml
# Prometheus server
prometheus:
  prometheusSpec:
    # Remote storage configuration
    remoteWrite:
      - url: "http://influxdb:8086/api/v1/prom/write?db=prometheus"
    remoteRead:
      - url: "http://influxdb:8086/api/v1/prom/read?db=prometheus"
  # Ingress configuration to expose the web UI
  ingress:
    enabled: true
    annotations:
      kubernetes.io/ingress.class: traefik # ingress class
    hosts:
      - "prometheus.mydomain.io" # domain name
alertmanager:
  # Alertmanager configuration
  config:
    global:
      # SMTP settings
      smtp_smarthost: 'xxx'
      smtp_from: 'xxx'
      smtp_auth_username: 'xxx'
      smtp_auth_password: 'xxx'
      # Global OpsGenie settings
      # opsgenie_api_key: ""
    # Alert routing
    route:
      receiver: 'monitoring-warning'
      group_by: ['alertname']
      group_wait: 30s
      group_interval: 3m
      repeat_interval: 8h
      routes:
        - match:
            severity: critical
          receiver: monitoring-critical
          group_by: ['alertname']
        - match:
            severity: warning
          receiver: monitoring-warning
          group_by: ['alertname']
    # Inhibition rules
    inhibit_rules:
      - source_match:
          severity: 'critical'
        target_match:
          severity: 'warning'
        # Inhibit alerts with the same alert name
        equal: ['alertname']
    # Receivers
    receivers:
      - name: 'monitoring-critical'
        email_configs:
          - to: 'monitor@mydomain.com'
        # Webhook that forwards alerts to DingTalk; a forwarding service has to be
        # deployed separately (see the project code)
        webhook_configs:
          - send_resolved: true
            url: http://prometheus-webhook-dingtalk/dingtalk/monitoring/send
      - name: 'monitoring-warning'
        email_configs:
          - to: 'monitor@mydomain.com'
  alertmanagerSpec:
    # Alertmanager storage; silences and other state are stored as files
    storage:
      volumeClaimTemplate:
        spec:
          accessModes:
            - ReadWriteOnce
          storageClassName: monitor-ebs # pick a suitable storage class
          resources:
            requests:
              storage: 20Gi # pick a suitable size
  # Ingress configuration to expose the Alertmanager UI
  ingress:
    enabled: true
    annotations:
      kubernetes.io/ingress.class: traefik # ingress class
    hosts:
      - "alert.mydomain.io" # domain name
# Grafana configuration
grafana:
  replicas: 1
  adminPassword: "admin" # admin user with password "admin"
  env:
    # GF_SERVER_DOMAIN: "" # domain name
    GF_SERVER_ROOT_URL: "%(protocol)s://%(domain)s/"
    # GF_DATABASE_URL: "mysql://user:secret@host:port/database" # SQL database
  # Ingress configuration to expose the UI
  ingress:
    enabled: true
    annotations:
      kubernetes.io/ingress.class: traefik # ingress class
    hosts:
      - "grafana.mydomain.io" # domain name
# Exporter settings. Enable these on a self-managed cluster; on a cloud-managed
# cluster these components are not reachable, so they can be disabled.
kubeControllerManager:
  enabled: true
kubeEtcd:
  enabled: true
kubeScheduler:
  enabled: true
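With the values file in place, prometheus-operator is installed the same way as InfluxDB. The commands below are a sketch under the same assumptions (Helm 2 and the stable chart repository; the release name is arbitrary, and the InfluxDB pod name is a placeholder), followed by a quick check that remote write is actually landing data in InfluxDB:

helm install --name=prometheus --namespace=monitoring -f prometheus.yaml stable/prometheus-operator
# Once Prometheus starts scraping, the prometheus database in InfluxDB
# should begin to fill up via remote write
kubectl exec -n monitoring <influxdb-pod> -- influx -database prometheus -execute 'SHOW MEASUREMENTS LIMIT 10'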