[跟我学中小企业架构部署]之六:监控系统Nagios部署

Nagios 部署
所需软件:
nagios-3.2.2.tar.gz

解决perl编译问题

1
2
3
4
echo "export LC_ALL=C" >>/etc/profile
source /etc/profile
service iptables stop
chkconfig iptables off

#解决时间同步

1
2
/usr/sbin/ntpdate pool.ntp.org
uptime

#安装Nagios所需要的基础软件包

1
2
3
yum install gcc glibc glibc-common -y
yum install gd gd-devel -y
yum install httpd php -y

#安装后检查正确结果

1
rpm -qa gcc glibc glibc-common gd gd-devel httpd php

看到上图结果,说明基础软件包安装完成
安装Nagios

#创建nagios需要的用户及组

1
2
3
4
5
/usr/sbin/useradd -m nagios
/usr/sbin/useradd apache -M -s /sbin/nologin
/usr/sbin/groupadd nagcmd
/usr/sbin/usermod -a -G nagcmd nagios
/usr/sbin/usermod -a -G nagcmd apache
1
2
3
4
5
6
7
8
9
10
tar zxvf nagios-3.2.2.tar.gz
cd nagios-3.2.2
./configure --with-command-group=nagcmd
make all
make install
make install-init
make install-config
make install-commandmode
make install-webconf
cd ..

#创建nagios监控界面登录用户及密码

1
htpasswd -c /usr/local/nagios/etc/htpasswd.users nagiosadmin

注:此文章中我设置的密码是elain,后面会用到

#添加监控报警接收邮件

1
vi /usr/local/nagios/etc/objects/contacts.cfg +35

把nagios@localhost
改成 elain2012@hotmail.com

1
2
3
4
5
vi /etc/httpd/conf/httpd.conf +231   //查看是否为apache用户
vi /etc/httpd/conf/httpd.conf +265   //在下面添加ServerName 127.0.0.1:80
service httpd start
chkconfig httpd on
ps -ef |grep httpd

安装Nagios插件

1
2
3
4
5
6
7
8
9
10
11
12
13
tar zxvf nagios-plugins-1.4.15.tar.gz
cd nagios-plugins-1.4.15
./configure --with-nagios-user=nagios --with-nagios-group=nagios --enable-perl-modules
make
make install
cd ..
tar zxvf net-snmp-5.4.2.1.tar.gz
cd net-snmp-5.4.2.1
./configure
make
make install
cd ..
service nagios start

安装 nrpe

1
2
3
4
5
tar zxvf nrpe-2.12.tar.gz
cd nrpe-2.12
./configure
make all
make install-plugin

检测语法

1
/usr/local/nagios/bin/nagios -v /usr/local/nagios/etc/nagios.cfg

现在在浏览器输入 http://10.0.10.86/nagios 登录
User: nagiosadmin
Passwd: elain

看到Nagios界面,说明初步安装成功!!!
注:此处有一个警告,是因为Nagios服务端上的apache没有首页,添加一个首页就没有此警告了!

配置nagios监控服务
――――――――――――――――

1
vi /usr/local/nagios/etc/nagios.cfg +33

加到第 33行下面:

1
2
cfg_file=/usr/local/nagios/etc/objects/host.cfg
cfg_file=/usr/local/nagios/etc/objects/service.cfg

建立上面两个文件

1
2
touch /usr/local/nagios/etc/objects/host.cfg
touch /usr/local/nagios/etc/objects/service.cfg

检查语法:

1
/etc/init.d/nagios checkconfig

服务端配置完毕!!!

被监控端配置(LAMP,LNMP,LB1,LB2,DB1,DB2,BAK配置基本一样,以LAMP为例)
LAMP
被监控端所需软件:

1
2
nagios-plugins-1.4.15.tar.gz
nrpe-2.12.tar.gz

解决perl编译问题

1
2
3
4
5
6
echo "export LC_ALL=C" >> /etc/profile
source /etc/profile
service iptables stop
chkconfig iptables off
/usr/sbin/ntpdate pool.ntp.org
uptime

#创建nagios需要的用户及组

1
/usr/sbin/adduser nagios -M

上传软件:

1
2
3
4
5
6
7
8
tar zxvf nagios-plugins-1.4.15.tar.gz
cd nagios-plugins-1.4.15
./configure --prefix=/usr/local/nagios --enable-perl-modules --enable-redhat-pthread-workaround
make
make install
cd ..
ls /usr/local/nagios/libexec/ |wc -l                 //查看安装的插件数量

安装 nrpe

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
tar zxvf nrpe-2.12.tar.gz
cd nrpe-2.12
./configure
make all
make install-plugin
make install-daemon
make install-daemon-config
cd ..
#安装相关的插件
tar zxvf Nagios-Plugin-0.34.tar.gz
cd Nagios-Plugin-0.34
perl Makefile.PL
make
make install
cd ..

#配置nrpe

1
2
3
cd /usr/local/nagios/etc
vi nrpe.cfg +79
allowed_hosts=127.0.0.1,10.0.10.86    //添加可以监控该服务端的NagiosServer的IP

注:在这里可以自己写一些插件,放到插件目录,给予可执行权限,然后在此文件中引用此插件,在服务器端的相应服务中引用此定义即可!

1
2
3
/usr/local/nagios/bin/nrpe -c /usr/local/nagios/etc/nrpe.cfg -d        //启动nrpe
echo "/usr/local/nagios/bin/nrpe -c /usr/local/nagios/etc/nrpe.cfg -d" >>/etc/rc.local
ps -ef|grep nagios

重启nagios nrpe组合命令

1
pkill nrpe && /usr/local/nagios/bin/nrpe -c /usr/local/nagios/etc/nrpe.cfg -d

到此被监控端LAMP配置完成,其它被监控端配置一样,就不一一写了。

监控服务端添加主机配置
在commands.cfg 中加入 check_nrpe 的插件配置

1
2
3
4
5
6
7
8
cd /usr/local/nagios/etc/objects
cp commands.cfg  commands.cfg.ori                 //把原始配置文件做好备份
vi commands.cfg                                   //在文件结尾加入下面内容
# 'check_nrpe' command definition
define command{
command_name       check_nrpe
command_line       $USER1$/check_nrpe -H $HOSTADDRESS$ -c $ARG1$
}

检查语法:

1
/etc/init.d/nagios checkconfig

使配置文件生效命令

1
2
/etc/init.d/nagios reload
/usr/local/nagios/libexec/check_nrpe -H localhost

查看是否能返回nrpe版本号,能返回则说明正常!

#允许防火墙通过(实验环境,直接关闭防火墙即可)

1
2
3
iptables -A INPUT -p tcp --dport 5666 -j ACCEPT
iptables -A INPUT -s 10.0.10.0/24 -p tcp -m tcp -j ACCEPT
iptables -A INPUT -s 10.0.10.0/24 -p udp -m udp -j ACCEPT

在host.cfg里添加被监控主机:
host.cfg (也可以以每台服务器的IP地址命名配置文件,但需在nagios.cfg中添加对应的cfg_file)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16

define host {
use                           linux-server
host_name                     lamp             //被监控主机名称
alias                          web-server
address                       10.0.10.83         //被监控主机IP
check_command                check-host-alive
max_check_attempts             3
normal_check_interval           2
retry_check_interval             2
check_period                   24x7
notification_interval             300
notification_period              24x7
notification_options              d,u,r
contact_groups                 admins
}

注:每添加一台,需在host.cfg里追加如上内容,修改上面注释的两行即可。

service.cfg

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76

define service {
use                             generic-service
host_name                       lamp
service_description                Current Load
check_command                  check_nrpe!check_load
max_check_attempts               2
normal_check_interval             4
retry_check_interval               4
check_period                     24x7
notification_interval               1440
notification_period                24x7
notification_options              w,u,c,r
contact_groups                  admins
process_perf_data                1
}
define service {
use                              generic-service
host_name                        lamp
service_description                 MEM Usage
check_command                    check_nrpe!check_mem
max_check_attempts                 2
normal_check_interval                4
retry_check_interval                  4
check_period                        24x7
notification_interval                  1440
notification_period                   24x7
notification_options                   w,u,c,r
contact_groups                       admins
process_perf_data                    1
}
define service {
use                               generic-service
host_name                         lamp
service_description                  Swap Usage
check_command                    check_nrpe!check_swap
max_check_attempts                 10
normal_check_interval                3
retry_check_interval                  4
check_period                        24x7
notification_interval                  480
notification_period                   workhours
notification_options                  w,u,c,r
contact_groups                      admins
process_perf_data                    1
}
define service {
use                               generic-service
host_name                         lamp
service_description                  Disk Partition
check_command                    check_nrpe!check_disk
max_check_attempts                 8
normal_check_interval                3
retry_check_interval                  2
check_period                        24x7
notification_interval                  360
notification_period                   24x7
notification_options                  w,u,c,r
contact_groups                      admins
process_perf_data                    1
}
define service {
use                                generic-service
host_name                          lamp
service_description                 HTTP
check_command                   check_http
max_check_attempts                2
normal_check_interval              4
retry_check_interval                4
check_period                     24x7
notification_interval                1440
notification_period                 24x7
notification_options                w,u,c,r
contact_groups                    admins
process_perf_data                  1
}

注:上面只是添加了基本的一些监控,可根据不同的服务器选择所需的监控。

检测语法

1
/usr/local/nagios/bin/nagios -v /usr/local/nagios/etc/nagios.cfg

表示配置正确
service nagios restart
访问http://10.0.10.86/nagios 查看添加的主机

注:收集信息需几分钟,如果一开始访问没有看到属正常,等几分钟就好了!
其它服务器的配置就不一一写了,全部和LAMP一样,可根据不同的需求添加或删除被监控项!!

报警方法配置请参考本人的另一篇文章:

CentOS下nagios报警飞信部署四步走 http://www.elain.org/?p=467

文章目录
,