本文最后更新于 1211 天前,其中的信息可能已经有所发展或是发生改变。
1.需要配置两个文件
config.conf:
#gp-master信息
[gp_server_info]
host=127.0.0.1
port=29001
username=gpadmin
password=111
#tomcat-nginx信息
[tomcat_nginx_server_info]
host=127.0.0.1
port=29001
username=gpadmin
password=111
#维护页面信息
[nbd_server_info]
login_url=http://ip/nbdGD/bigdata/user/login
check_all_url=http://ip/nbdGD/bigdata/monitor/gp
service_status_url=http://ip/nbdGD/bigdata/monitor/congestion
mr_service_url=http://ip/nbdGD/bigdata/monitor/mrsender
username=admin
password=111
#CDH信息
[cdh_server_info]
host=127.0.0.1
port=57182
username=admin
password=1111
cluster_name=Cluster 1
server_info.json:服务器信息
type:用于区分服务器的用途类型(与代码中的判断一致,均为小写),如gp代表GP的master节点,sdw代表GP的segment节点,mr代表采集MR的服务器,kpi代表采集KPI的服务器
[
{
"host": "127.0.0.1",
"username": "root",
"password": "111!@#",
"port": 29001,
"type": "gp"
},
{
"host": "127.0.0.1",
"username": "root",
"password": "111!@#",
"port": 29001,
"type": "sdw"
},
{
"host": "127.0.0.1",
"username": "root",
"password": "111!@#",
"port": 29001,
"type": "mr"
},
{
"host": "127.0.0.1",
"username": "root",
"password": "111!@#",
"port": 29001,
"type": "kpi"
}
]
2.主代码如下:
#!/usr/bin/python3
# -*- coding: UTF-8 -*-
import configparser
import datetime
import json
import logging
import os
import platform
import random
import re
import sys
import time
import paramiko
import requests
"""
全局函数区域...
"""
# Start-up banner: begins with the ANSI escape \033[1;34m (bold blue) and is
# printed once when the module is executed.
# NOTE(review): this is not a raw string, so backslash sequences inside the
# ASCII art are subject to normal escape processing.
banner = """\033[1;34m
////////////////////////////////////////////////////////////////////
// _ooOoo_ //
// o8888888o //
// 88" . "88 //
// (| ^_^ |) //
// O\ = /O //
// ____/`---'\____ //
// .' \\| |// `. //
// / \\||| : |||// \ //
// / _||||| -:- |||||- \ //
// | | \\\ - /// | | //
// | \_| ''\---/'' | | //
// \ .-\__ `-` ___/-. / //
// ___`. .' /--.--\ `. . ___ //
// ."" '< `.___\_<|>_/___.' >'"". //
// | | : `- \`.;`\ _ /`;.`/ - ` : | | //
// \ \ `-. \_ __\ /__ _/ .-` / / //
// ========`-.____`-.___\_____/___.-`____.-'======== //
// `=---=' //
// ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ //
// 佛祖保佑 永不宕机 永无BUG //
////////////////////////////////////////////////////////////////////
linux自动化巡检工具 Version:1.0
Company:hcxt Author:hcxt
"""
print(banner)
# Operating system name as reported by platform.system()
system_type = platform.system()
# Working directory at start-up
work_dir = os.getcwd()
# Directory that holds the log files
log_path = os.path.join(work_dir, 'logs')
# Timestamp taken once at start-up; the two formatted strings derive from it
cur_day = datetime.datetime.now()
cur_day_hour_str = datetime.datetime.strftime(cur_day, "%Y%m%d%H")
cur_day_str = datetime.datetime.strftime(cur_day, "%Y%m%d")
# Configuration file locations, given relative to the working directory
conf_file_path = './config/config.conf'
server_info_json_path = './config/server_info.json'
# Pool of User-Agent strings; the maintenance-page login picks one at random
user_Agent_list = [
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.182 Safari/537.36 Edg/88.0.705.74",
"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0",
"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
"Opera/9.80 (Windows NT 10.0; U; zh-cn) Presto/2.9.168 Version/11.50",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36"
]
def read_nbd_config_info():
    """
    Load the maintenance-page settings from the ``nbd_server_info`` section.

    :return: dict with the keys login_url, check_all_url, mr_service_url,
             service_status_url, username and password
    """
    section_name = 'nbd_server_info'
    parser = configparser.RawConfigParser()
    parser.read(conf_file_path, encoding='utf-8')
    # Options are fetched in this order so a missing option is reported
    # exactly as before.
    read_order = ('login_url', 'check_all_url', 'service_status_url',
                  'mr_service_url', 'username', 'password')
    values = {option: parser.get(section_name, option) for option in read_order}
    parser.clear()
    key_order = ('login_url', 'check_all_url', 'mr_service_url',
                 'service_status_url', 'username', 'password')
    return {key: values[key] for key in key_order}
def get_nbd_login_url_info():
    """
    Log in to the maintenance page and return the decoded JSON reply.

    The credentials and login URL come from read_nbd_config_info(). The
    response is always closed, including on the early-return paths.

    :return: the parsed JSON body when the HTTP status is OK, otherwise None
             (also None when the request or the JSON decoding fails)
    """
    nbd_info = read_nbd_config_info()
    headers = {
        "Content-Type": "application/x-www-form-urlencoded",
        "Accept": "*/*",
        "Connection": "keep-alive",
        "User-Agent": random.choice(user_Agent_list),
    }
    login_data = {'username': nbd_info['username'], 'password': nbd_info['password']}
    try:
        req = requests.post(nbd_info['login_url'], headers=headers, timeout=200, data=login_data)
        try:
            if not req.ok:
                return None
            logging.info('成功登录维护页面,获取数据中.....')
            return json.loads(req.text)
        finally:
            # Previously close() sat after the return statements and never ran.
            req.close()
    except Exception as e:
        # Keep the best-effort contract, but record why the request failed.
        logging.error('请求出错啦...%s', e)
        return None
def _nbd_get_json(base_url, username, ak):
    """Fetch one maintenance-page endpoint and return its decoded JSON body."""
    link = base_url + '?username={0}&usercity=ALL&ak={1}'.format(username, ak)
    req = requests.get(link, timeout=100)
    try:
        return json.loads(req.text)
    finally:
        req.close()


def get_nbd():
    """
    Log the day-granularity load figures reported by the maintenance page.

    Three endpoints are queried after a successful login: the per-day load
    summary, the pending-file counters and the per-server MR cell counts.
    Everything is written to the log; nothing is returned.

    :return: None
    """
    logging.info("#####正在获取天粒度入库数据,请耐心等待#####\n")
    login_data = get_nbd_login_url_info()
    if login_data is None:
        logging.error('访问失败,请检查维护页面程序是否正常...')
        return
    # A reply without a 'success' field is treated as a failed login instead
    # of raising KeyError.
    if not isinstance(login_data, dict) or login_data.get('success') != 1:
        logging.error('登录失败请检查账号密码...')
        return

    nbd_info = read_nbd_config_info()
    username = nbd_info['username']
    try:
        ak = login_data['ak']

        data_json = _nbd_get_json(nbd_info['check_all_url'], username, ak)
        if data_json is not None and len(data_json) > 0:
            day_infos = ['mr_nr_day', 'mr_lte_day', 'kpi_lte_day', 'kpi_lte_nsa_day',
                         'kpi_nr_nsa_du_day', 'kpi_nr_nsa_cu_day', 'kpi_nr_sa_du_day', 'kpi_nr_sa_cu_day']
            for data in data_json:
                for info in day_infos:
                    # A missing key is reported as empty rather than aborting
                    # the whole report.
                    value = data.get(info)
                    if value is not None and len(value) > 0:
                        logging.info(info + ':' + str(value))
                    else:
                        logging.info(info + ':{}')

        logging.info("\n#####正在获取本地平台当前程序状态数据,请耐心等待#####\n")
        data_json = _nbd_get_json(nbd_info['service_status_url'], username, ak)
        datas = data_json.get('Datas') if isinstance(data_json, dict) else None
        if datas is not None:
            for key in datas:
                logging.info('待处理' + key + ':' + str(datas[key]) + '文件')

        logging.info("\n#####正在获取MR服务器今日处理MR小区数 ,请耐心等待#####\n")
        data_json = _nbd_get_json(nbd_info['mr_service_url'], username, ak)
        datas = data_json.get('Datas') if isinstance(data_json, dict) else None
        if datas is not None:
            for key in sorted(datas):
                logging.info('服务器ip:' + key + '-|-今日处理小区数:' + str(datas[key]))
    except Exception as e:
        # Log through the logger (not print) so the cause reaches the log file.
        logging.error(e)
        logging.error("请求失败...")
def mkdir(path):
    """
    Create a directory (with parents) unless it already exists.

    The creation is attempted directly and an existing path is detected from
    the resulting FileExistsError, so there is no window between an
    existence check and the creation.

    :param path: directory to create; surrounding whitespace is stripped
    :return: True when the directory was created, False when the path
             already existed
    """
    path = path.strip()
    try:
        os.makedirs(path)
    except FileExistsError:
        return False
    return True
# Logging setup: messages go both to the console and to an hourly log file
# under log_path. The formatter is the default one (message text only).
mkdir(log_path)
log_file = os.path.join(log_path, "%s.log" % cur_day_hour_str)
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
formatter = logging.Formatter()
# log_file already contains log_path, so it is used as-is; UTF-8 is forced
# because the log messages are Chinese and the locale default may not cope.
fh = logging.FileHandler(log_file, encoding='utf-8')
fh.setLevel(logging.INFO)
fh.setFormatter(formatter)
ch = logging.StreamHandler()
ch.setLevel(logging.INFO)
ch.setFormatter(formatter)
logger.addHandler(ch)
logger.addHandler(fh)
logger.info("检查时间: %s" % datetime.datetime.strftime(cur_day, "%Y%m%d %H:%M:%S"))
def read_config_info(section):
    """
    Read the connection details of one server section of the config file.

    :param section: name of the section to read
    :return: dict with host, port (as int), username and password
    """
    parser = configparser.RawConfigParser()
    parser.read(conf_file_path, encoding='utf-8')
    server = {
        "host": parser.get(section, 'host'),
        "port": parser.getint(section, 'port'),
        "username": parser.get(section, 'username'),
        "password": parser.get(section, 'password'),
    }
    parser.clear()
    return server
def read_all_server_info():
    """
    Load the server list from the server_info JSON file and log every host.

    :return: the decoded JSON content, or None when reading, decoding or
             logging the hosts fails (the error is logged)
    """
    try:
        with open(server_info_json_path, encoding="utf-8") as handle:
            servers = json.load(handle)
        for entry in servers:
            logging.info(entry['host'])
    except Exception as err:
        logging.error(err)
        return None
    return servers
def connect(host, username, password, port=22):
    """
    Open an SSH connection to a server.

    :param host: server address
    :param username: login user
    :param password: login password
    :param port: SSH port
    :return: a connected paramiko.SSHClient, or None when the connection
             could not be established (the reason is logged)
    """
    logging.info("#####服务器:{}#####".format(host))
    client = paramiko.SSHClient()
    try:
        # NOTE(review): AutoAddPolicy accepts unknown host keys without
        # verification; acceptable only on a trusted network.
        client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        client.connect(host, port, username=username, password=password, timeout=20)
        return client
    except Exception as e:
        # Record why the connection failed and release the half-open client.
        logging.error(e)
        client.close()
        return None
def ssh_exec_command(conn, command):
    """
    Run a command over an SSH connection and return its standard output.

    :param conn: object exposing ``exec_command(command)`` that returns a
                 (stdin, stdout, stderr) triple, e.g. a paramiko SSHClient
    :param command: command line to execute
    :return: list of output lines; an empty list when execution fails (the
             error is logged), so callers can always ``''.join(...)`` it
    """
    # Default to an empty list: every caller joins the result, and joining
    # None would raise TypeError.
    results = []
    try:
        stdin, stdout, stderr = conn.exec_command(command)
        results = stdout.readlines()
    except Exception as e:
        logging.error(e)
    return results
def df_exec_command(conn):
    """
    Log the disk usage of the server by running ``df -h``.

    :param conn: connection handed to ssh_exec_command
    :return: None
    """
    logging.info("#####正在检查磁盘空间,请耐心等待#####")
    output_lines = ssh_exec_command(conn, 'df -h')
    disk_report = ''.join(output_lines)
    logging.info(disk_report)
def free_exec_command(conn):
    """
    Log the memory usage of the server by running ``free``.

    :param conn: connection handed to ssh_exec_command
    :return: None
    """
    logging.info("#####正在检查内存使用率,请耐心等待#####")
    output_lines = ssh_exec_command(conn, 'free')
    memory_report = ''.join(output_lines)
    logging.info(memory_report)
def load_stat_exec_command(conn):
    """
    Log the load figures read from ``/proc/loadavg`` on the server.

    The file holds five whitespace-separated fields: the 1-, 5- and
    15-minute load averages, the running/total scheduling-entity counts and
    the most recently created PID. Output with fewer fields is reported as
    an error instead of raising IndexError.

    :param conn: connection handed to ssh_exec_command
    :return: None
    """
    logging.info("#####正在获取负载信息,请耐心等待#####")
    loadavg_data = ''.join(ssh_exec_command(conn, 'cat /proc/loadavg') or [])
    loadavg_arr = loadavg_data.split()
    if len(loadavg_arr) < 5:
        logging.error('获取负载信息失败:{}'.format(loadavg_data.strip()))
        logging.info('\n')
        return
    # The 4th field is "running/total" entities, not an average, hence the
    # corrected label.
    labels = ('1分钟', '5分钟', '15分钟', '运行/总进程数', '最近运行的pid')
    loadavg = dict(zip(labels, loadavg_arr))
    logging.info(loadavg)
    logging.info('\n')
def account_check_exec_command(conn):
    """
    Summarise the accounts found in ``/etc/shadow`` on the remote server.

    An account is listed as usable when its password field contains neither
    ``*`` nor ``!``; accounts whose password field is empty are additionally
    listed as empty-password users. Both lists come from the same remote
    output (the empty-password check used to inspect the local machine).

    :param conn: connection handed to ssh_exec_command
    :return: formatted report string with both account lists
    """
    logging.info("#####正在检查服务器用户信息,请耐心等待#####")
    shadow_text = ''.join(ssh_exec_command(conn, 'cat /etc/shadow') or [])
    account_list = []
    empty_password_users = []
    for line in shadow_text.split('\n'):
        name, separator, remainder = line.partition(':')
        if not separator or not name:
            # Blank or malformed line: nothing to report.
            continue
        password_field = remainder.split(':', 1)[0]
        if not password_field:
            empty_password_users.append(name)
        if '*' in password_field or '!' in password_field:
            continue
        account_list.append(name)
    # One name per line, matching the layout the awk output used to have.
    anonymous_account = ''.join(user + '\n' for user in empty_password_users)
    account = '存在的账户:\n{0}\n空口令用户:\n{1}\n'.format(account_list, anonymous_account)
    return account
def gp_exec_command(conn):
    """
    Log the output of ``gpstate -c`` when connected as ``gpadmin``.

    :param conn: connection handed to ssh_exec_command
    :return: None
    """
    logging.info("#####正在检查gp数据库状态,请耐心等待#####\n")
    current_user = ''.join(ssh_exec_command(conn, 'whoami')).strip()
    if current_user != 'gpadmin':
        logging.info('当前账号不是gpadmin账号')
        return
    state_lines = ssh_exec_command(conn, 'gpstate -c')
    logging.info(''.join(state_lines))
def get_gp_sql():
    """
    Build the SQL template that counts the rows of the latest day per table.

    One ``select`` is produced per data type and joined with ``union all``.
    Each select references its own ``${<type>_table_name}`` placeholder, to
    be substituted by the caller. The ``kpi_nr_sa_du`` and ``kpi_nr_sa_cu``
    rows previously pointed at the ``nsa`` placeholders and therefore
    counted the wrong tables.

    :return: SQL template string with ``${..._table_name}`` placeholders
    """
    table_types = ('mr_lte', 'mr_nr', 'kpi_lte', 'kpi_nb', 'kpi_lte_nsa',
                   'kpi_nr_nsa_du', 'kpi_nr_nsa_cu', 'kpi_nr_sa_du', 'kpi_nr_sa_cu')
    select_template = ("select '{0}' as type,dt,count(1) from ${{{0}_table_name}} "
                       "where dt = (select max(dt) from ${{{0}_table_name}}) group by dt")
    selects = [select_template.format(table_type) for table_type in table_types]
    return '\n' + '\nunion all\n'.join(selects) + '\n'
def replace_table(results):
    """
    Reduce the psql output of a ``max(...)`` query to the bare value.

    The column header text ``max``, dashes, the ``(1 row)`` footer and line
    breaks are removed, then surrounding whitespace is stripped.

    :param results: raw command output
    :return: the cleaned string
    """
    cleaned = results
    for noise in ('max', '-', '(1 row)', '\n'):
        cleaned = cleaned.replace(noise, '')
    return cleaned.strip()
# Data types whose newest "<type>_day_2_____" table is looked up, in query order.
_GP_DAY_TABLE_TYPES = ('mr_lte', 'mr_nr', 'kpi_lte', 'kpi_nb', 'kpi_lte_nsa',
                       'kpi_nr_nsa_du', 'kpi_nr_nsa_cu', 'kpi_nr_sa_du', 'kpi_nr_sa_cu')


def _run_gp_psql(conn, sql):
    """Run one SQL statement through psql on the lte_mr database and return the output text."""
    cmd = 'psql -d lte_mr -U gpadmin -c "{}"'.format(sql.strip())
    return ''.join(ssh_exec_command(conn, cmd) or [])


def gp_data_exec_command(conn):
    """
    Log the row counts of the latest day for every day-granularity table.

    Marked as abandoned in the original notes. For each data type the newest
    table name is looked up in ``pg_tables`` and substituted into the
    template from get_gp_sql(); the resulting query output is logged. The
    work only happens when the connection is logged in as ``gpadmin``.

    :param conn: connection handed to ssh_exec_command
    :return: None
    """
    logging.info("#####正在获取天粒度入库数据,请耐心等待#####\n")
    username = ''.join(ssh_exec_command(conn, 'whoami') or [])
    if 'gpadmin' != username.strip():
        logging.info('当前账号不是gpadmin账号')
        return

    sql = get_gp_sql()
    for table_type in _GP_DAY_TABLE_TYPES:
        lookup = ("SELECT max(tablename) FROM pg_tables WHERE schemaname = 'smartinsight' "
                  "and tablename like '{}_day_2_____' limit 1").format(table_type)
        table_name = replace_table(_run_gp_psql(conn, lookup))
        sql = sql.replace('${%s_table_name}' % table_type, table_name)
    logging.info(_run_gp_psql(conn, sql))
def tomcat_nginx_exec_command(conn):
    """
    Log the nginx and tomcat processes running on the server.

    The process listing excludes the ``grep`` helper itself, in line with
    the process check in cj_exec_command, so an empty result really means
    that the service is not running.

    :param conn: connection handed to ssh_exec_command
    :return: None
    """
    logging.info("#####正在检查 nginx 状态,请耐心等待#####")
    results = ''.join(ssh_exec_command(conn, 'ps -ef | grep nginx | grep -v grep') or [])
    logging.info(results)
    logging.info("#####正在检查 tomcat 状态,请耐心等待#####")
    results = ''.join(ssh_exec_command(conn, 'ps -ef | grep tomcat | grep -v grep') or [])
    logging.info(results)
def cj_exec_command(conn, type_str, host):
"""
Check that the collector processes expected for a server type are running.
:param conn: connection handed to ssh_exec_command
:param type_str: server type; 'kpi' and 'mr' each select a process list,
any other value leaves the list empty
:param host: server address shown in the confirmation prompt
:return: None
"""
process_arr = []
if type_str == 'kpi':
process_arr = ['northbase.Main', 'parse.ParseNRM', 'parse.ParsePM',
'download.MRTaskGenerator', 'summary.KPI2GP', 'summary.MR2GP',
'summary.KPISummary', 'summary.MRSummary', 'redis.LoadWr', 'redis-server']
elif type_str == 'mr':
process_arr = ['download.MRDownload', 'parse.ParseMR', 'redis-server']
if len(process_arr) > 0:
logging.info("#####正在检查{}采集程序是否正常,请耐心等待#####\n".format(type_str))
for process in process_arr:
# Count the matching processes, excluding the grep helper itself
cmd = 'ps -ef | grep {}| grep -v grep |wc -l'.format(process)
results = ''.join(ssh_exec_command(conn, cmd))
# NOTE(review): int() raises ValueError when the output is not a number
# (e.g. empty output) - confirm command failures are handled upstream
str_input = process+':{}'.format(int(results.replace('\n', '')))
logging.info(str_input)
logging.info('------------------------------------')
# Pause until the operator confirms the listed process counts
input("请确认{}服务器:{}的集团采集程序,按回车键继续...".format(type_str, host))
def read_cdh_config_info():
    """
    Load the CDH connection settings from the ``cdh_server_info`` section.

    :return: dict with host, cluster_name, port (kept as a string),
             username and password
    """
    section_name = 'cdh_server_info'
    parser = configparser.RawConfigParser()
    parser.read(conf_file_path, encoding='utf-8')
    # Options are fetched in this order so a missing option is reported
    # exactly as before.
    read_order = ('host', 'port', 'username', 'password', 'cluster_name')
    values = {option: parser.get(section_name, option) for option in read_order}
    parser.clear()
    key_order = ('host', 'cluster_name', 'port', 'username', 'password')
    return {key: values[key] for key in key_order}
def cdh_login_cookie():
    """
    Log in to Cloudera Manager and return the session cookie value.

    The login form is posted to ``/j_spring_security_check`` without
    following redirects, and the ``CLOUDERA_MANAGER_SESSIONID`` cookie is
    read from the response.

    :return: the session id string, or None when the login request fails or
             the cookie is absent (the error is logged)
    """
    try:
        logging.info("\n#####正在登录CDH...请耐心等待#####\n")
        info = read_cdh_config_info()
        headers = {
            "Content-Type": "application/x-www-form-urlencoded",
            "Accept": "*/*",
            "Connection": "keep-alive",
        }
        link = 'http://{0}:{1}/j_spring_security_check'.format(info['host'], info['port'])
        login_data = {'j_username': info['username'], 'j_password': info['password']}
        req = requests.post(link, headers=headers, timeout=100, data=login_data, allow_redirects=False)
        try:
            return req.cookies['CLOUDERA_MANAGER_SESSIONID']
        finally:
            req.close()
    except Exception as e:
        # Pass the exception as a log argument: concatenating str + Exception
        # raised TypeError inside this handler.
        logging.error('请求登录CDH失败...请手动登录CDH检查服务.....%s', e)
        return None
def cdh_health_status(health_type):
    """
    Translate a CDH health code into its description.

    :param health_type: health code such as 'GOOD' or 'BAD'
    :return: the matching description, or None for an unknown code
    """
    descriptions = {
        'DISABLED': '服务已禁用(无须检查)',
        'HISTORY_NOT_AVAILABLE': '服务长期不可用',
        'NOT_AVAILABLE': '服务不可用',
        'GOOD': '服务运行状态良好(绿色)',
        'CONCERNING': '服务存在告警信息(黄色)',
        'BAD': '存在挂起程序,请登录检查反馈情况!!!!(红色)',
    }
    return descriptions.get(health_type)
def get_cdh_time_series(cookies, host, port):
    """
    Query the HBase "total_fd_open_across_regionservers" metric from CDH.

    :param cookies: value for the ``Cookie`` request header; when empty or
                    None no request is made
    :param host: Cloudera Manager host
    :param port: Cloudera Manager port
    :return: the ``value`` of the first data point of the first non-empty
             time series, or None when nothing is available or the request
             fails (the error is logged)
    """
    if not cookies:
        return None
    headers = {
        "Accept": "*/*",
        "Connection": "keep-alive",
        "Cookie": cookies
    }
    query_str = 'SELECT total_fd_open_across_regionservers WHERE entityName = "hbase" AND category = SERVICE'
    link = 'http://{0}:{1}/api/v19/timeseries?query={2}'.format(host, port, query_str)
    try:
        req = requests.get(link, headers=headers, timeout=100)
        try:
            data_json = json.loads(req.text)
        finally:
            # Closed here so the early return below cannot skip it.
            req.close()
        if data_json is not None and data_json['items'] is not None:
            for item in data_json['items']:
                series = item['timeSeries']
                if series is not None and len(series) > 0:
                    data = series[0]['data']
                    # None is tested before len(); the reverse order raised
                    # TypeError on a null data list.
                    if data is not None and len(data) > 0:
                        return data[0]['value']
    except Exception as e:
        logging.error('请求CDH数据失败...请手动登录CDH检查服务.....%s', e)
    return None
def get_cdh_clusters_info():
    """
    Log the health of every CDH cluster service and of the CM service.

    After logging in, the services of the configured cluster are listed with
    their health checks; for the ``hbase`` service the number of open file
    descriptors is also reported. The operator confirms each service with
    Enter. Finally the Cloudera Manager service health is logged.

    :return: None
    """
    logging.info("#####正在获取CDH服务状态,请耐心等待#####\n")
    cookies = cdh_login_cookie()
    if not cookies:
        logging.error('获取CDH--cookie失败...请手动登录CDH检查服务.....')
        return

    def describe(code):
        # Fall back to the raw code so an unknown status cannot break the
        # string concatenation below.
        return cdh_health_status(code) or str(code)

    def fetch_json(link, headers):
        # Each response is closed as soon as its body has been decoded.
        req = requests.get(link, headers=headers, timeout=100)
        try:
            return json.loads(req.text)
        finally:
            req.close()

    info = read_cdh_config_info()
    host = info['host']
    port = info['port']
    cluster_name = info['cluster_name']
    cookies = 'CLOUDERA_MANAGER_SESSIONID={}'.format(cookies)
    try:
        headers = {
            "Accept": "*/*",
            "Connection": "keep-alive",
            "Cookie": cookies
        }
        link = 'http://{0}:{1}/api/v19/clusters/{2}/services'.format(host, port, cluster_name)
        data_json = fetch_json(link, headers)
        if data_json is not None and data_json['items'] is not None:
            for item in data_json['items']:
                logging.info('------>' + item['name'] + ':' + describe(item['healthSummary']) + '<------')
                for check in item['healthChecks']:
                    logging.info(check['name'] + ':' + describe(check['summary']))
                if item['name'] == 'hbase':
                    open_file_num = get_cdh_time_series(cookies, host, port)
                    if open_file_num is None:
                        # Comparing None with >= used to abort the whole report.
                        logging.error('未能获取HBASE打开总文件描述符数,请手动登录CDH检查.....')
                    elif open_file_num >= 15000:
                        logging.info('HBASE打开总文件描述符数大于等于1.5W--建议重启hbase!!!!')
                    else:
                        logging.info('HBASE打开总文件描述符数:{}'.format(open_file_num))
                logging.info('\n')
                input("请确认{}服务状态,按回车键继续...".format(item['name']))
                logging.info('\n')

        link = 'http://{0}:{1}/api/v19/cm/service'.format(host, port)
        data_json = fetch_json(link, headers)
        if data_json is not None:
            logging.info('------>' + data_json['displayName'] + ':' + describe(data_json['healthSummary']) + '<------')
            for check in data_json['healthChecks']:
                logging.info(check['name'] + ':' + describe(check['summary']))
            logging.info('\n')
    except Exception as e:
        logging.error('请求登录CDH失败...请手动登录CDH检查服务.....%s', e)
def main():
    """
    Run the interactive inspection.

    Steps, each followed by an operator confirmation prompt:
    1. every server of the server list: disk, memory, load, collector
       processes and (for type 'gp') the GP database state;
    2. day-granularity load figures from the maintenance page;
    3. CDH cluster health;
    4. tomcat / nginx processes on the configured web server.
    Servers that could not be reached are collected and listed at the end.

    :return: None
    """
    false_server_info = []
    logging.info('开始获取服务器所有信息...')
    server_info = read_all_server_info()
    if server_info is None:
        # The remaining checks do not depend on the server list, so carry on
        # instead of crashing in the loop below.
        logging.error('读取服务器信息失败,跳过服务器巡检...')
        server_info = []
    logging.info("\n###########################\n")
    time.sleep(1)
    input("请确认待巡检的服务器ip信息,按回车键继续。")
    for info in server_info:
        host = info['host']
        username = info['username']
        password = info['password']
        port = info['port']
        type_str = info['type']
        conn = connect(host, username, password, port)
        if conn is None:
            logging.info('请检查{}服务器:{}的连通性...'.format(type_str, host))
            false_server_info.append(host)
        else:
            try:
                df_exec_command(conn)
                free_exec_command(conn)
                load_stat_exec_command(conn)
                input("请确认{}服务器:{}的磁盘/内存/服务器负载使用信息,按回车键继续...".format(type_str, host))
                time.sleep(1)
                logging.info("\n")
                cj_exec_command(conn, type_str, host)
                if type_str == 'gp':
                    gp_exec_command(conn)
                    input("请确认GP数据库状态,按回车键继续...")
            finally:
                # Release the connection even when one of the checks raises.
                conn.close()
        logging.info("\n-------------------------------------------------------------------------\n")

    # Load figures of the previous day
    get_nbd()
    input("请确认天粒度入库数据是否正常,按回车键继续...")
    # CDH monitoring
    get_cdh_clusters_info()
    input("请确认CDH集群状态是否正常,按回车键继续...")
    logging.info("\n")

    web_server = read_config_info('tomcat_nginx_server_info')
    conn = connect(web_server['host'], web_server['username'], web_server['password'], web_server['port'])
    if conn is None:
        # An unreachable web server is reported like any other host instead
        # of failing on conn.close().
        logging.info('请检查{}服务器:{}的连通性...'.format('tomcat_nginx', web_server['host']))
        false_server_info.append(web_server['host'])
    else:
        try:
            tomcat_nginx_exec_command(conn)
            input("请确认tomcat/nginx状态,按回车键继续...")
        finally:
            conn.close()
    logging.info("\n")
    logging.info('最后.请检查以下服务器的连通性:{}'.format(false_server_info))
# Script entry point: start the inspection and log a closing marker.
if __name__ == '__main__':
main()
logging.info('end....')
3.配置python环境及需要安装的依赖
python这边是使用python3.8的,我想快速安装,所以使用Miniconda3-latest-Linux-x86_64.sh
依赖如下。ssh(paramiko)相关:
PyNaCl-1.4.0/paramiko-2.7.2/bcrypt-3.2.0
requests:
requests-2.22.0-py2.py3-none-any
1.首先创建一个python_dev账号:
# 新建账号
adduser python_dev
# 设置密码
passwd python_dev
# 密码
Rj#Wvk031lyCx2
2.安装python
[python_dev@sxhdp01datanode05 python]$ chmod +x Miniconda3-latest-Linux-x86_64.sh
[python_dev@sxhdp01datanode05 python]$ ./Miniconda3-latest-Linux-x86_64.sh
installation finished.
Do you wish the installer to initialize Miniconda3
by running conda init? [yes|no]
[no] >>> no #这里我自己是不喜欢初始化的所以选择了no,如果想设置初始化可以是yes
3.安装依赖
# 解压
tar -zxvf bcrypt-3.2.0.tar.gz
# 进入文件夹并安装依赖
cd bcrypt-3.2.0
~/miniconda3/bin/python setup.py install
# 查看是否安装成功
~/miniconda3/bin/pip list
#同理
tar -zxvf PyNaCl-1.4.0.tar.gz
tar -zxvf paramiko-2.7.2.tar.gz
.....
#安装 requests,因为是whl包,所以用pip安装
~/miniconda3/bin/pip install requests-2.22.0-py2.py3-none-any.whl
4.手动运行脚本
~/miniconda3/bin/python check_server.py