配置安装linux自动化巡检工具步骤
本文最后更新于 1211 天前,其中的信息可能已经有所发展或是发生改变。

1.需要配置两个文件

config.conf:

#gp-master信息
[gp_server_info]
host=127.0.0.1
port=29001
username=gpadmin
password=111

#tomcat-nginx信息
[tomcat_nginx_server_info]
host=127.0.0.1
port=29001
username=gpadmin
password=111

#维护页面信息
[nbd_server_info]
login_url=http://ip/nbdGD/bigdata/user/login
check_all_url=http://ip/nbdGD/bigdata/monitor/gp
service_status_url=http://ip/nbdGD/bigdata/monitor/congestion
mr_service_url=http://ip/nbdGD/bigdata/monitor/mrsender
username=admin
password=111

#CDH信息
[cdh_server_info]
host=127.0.0.1
port=57182
username=admin
password=1111
cluster_name=Cluster 1

server_info.json 服务器信息

type:是区分数据库的使用类型,如gp是master节点,mr是代表采集mr服务器,KPI是代表采集KPI服务器

[
    {
        "host": "127.0.0.1",
        "username": "root",
        "password": "111!@#",
        "port": 29001,
        "type": "gp"
    },
    {
        "host": "127.0.0.1",
        "username": "root",
        "password": "111!@#",
        "port": 29001,
        "type": "sdw"
    },
    {
        "host": "127.0.0.1",
        "username": "root",
        "password": "111!@#",
        "port": 29001,
        "type": "mr"
    },
    {
        "host": "127.0.0.1",
        "username": "root",
        "password": "111!@#",
        "port": 29001,
        "type": "kpi"
    }
]

2.主代码如下:

#!/usr/bin/python3
# -*- coding: UTF-8 -*-

import configparser
import datetime
import json
import logging
import os
import platform
import random
import re
import sys
import time
import paramiko
import requests

"""
全局函数区域...
"""
banner = """\033[1;34m
////////////////////////////////////////////////////////////////////
//                          _ooOoo_                               //
//                         o8888888o                              //
//                         88" . "88                              //
//                         (| ^_^ |)                              //
//                         O\  =  /O                              //
//                      ____/`---'\____                           //
//                    .'  \\|     |//  `.                         //
//                   /  \\|||  :  |||//  \                        //
//                  /  _||||| -:- |||||-  \                       //
//                  |   | \\\  -  /// |   |                       //
//                  | \_|  ''\---/''  |   |                       //
//                  \  .-\__  `-`  ___/-. /                       //
//                ___`. .'  /--.--\  `. . ___                     //
//              ."" '<  `.___\_<|>_/___.'  >'"".                  //
//            | | :  `- \`.;`\ _ /`;.`/ - ` : | |                 //
//            \  \ `-.   \_ __\ /__ _/   .-` /  /                 //
//      ========`-.____`-.___\_____/___.-`____.-'========         //
//                           `=---='                              //
//      ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^        //
//            佛祖保佑       永不宕机     永无BUG                    //
////////////////////////////////////////////////////////////////////
linux自动化巡检工具           Version:1.0
Company:hcxt                Author:hcxt
"""
print(banner)
# 系统类型
system_type = platform.system()
# 当前路径
work_dir = os.getcwd()
# 日志目录
log_path = os.path.join(work_dir, 'logs')
# 当前时间
cur_day = datetime.datetime.now()
cur_day_hour_str = datetime.datetime.strftime(cur_day, "%Y%m%d%H")
cur_day_str = datetime.datetime.strftime(cur_day, "%Y%m%d")
conf_file_path = './config/config.conf'
server_info_json_path = './config/server_info.json'
user_Agent_list = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.182 Safari/537.36 Edg/88.0.705.74",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Opera/9.80 (Windows NT 10.0; U; zh-cn) Presto/2.9.168 Version/11.50",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36"
]


def read_nbd_config_info():
    """
    Load the maintenance-page (nbd) settings from the config file.

    :return: dict with the login/check/status/mr URLs and the credentials
    """
    parser = configparser.RawConfigParser()
    parser.read(conf_file_path, encoding='utf-8')
    wanted = ('login_url', 'check_all_url', 'mr_service_url',
              'service_status_url', 'username', 'password')
    info = {key: parser.get('nbd_server_info', key) for key in wanted}
    parser.clear()
    return info


def get_nbd_login_url_info():
    """
    Log in to the maintenance page and return the parsed session info.

    Posts the configured username/password to the login URL with a random
    User-Agent.

    :return: parsed JSON response (dict, contains 'success' and 'ak') on
             success, None on HTTP error or any exception
    """
    nbd_info = read_nbd_config_info()
    login_url = nbd_info['login_url']
    username = nbd_info['username']
    password = nbd_info['password']
    try:
        headers = {
            "Content-Type": "application/x-www-form-urlencoded",
            "Accept": "*/*",
            "Connection": "keep-alive",
            "User-Agent": random.choice(user_Agent_list),
        }
        login_data = {'username': username, 'password': password}
        req = requests.post(login_url, headers=headers, timeout=200, data=login_data)
        try:
            if req.ok:
                logging.info('成功登录维护页面,获取数据中.....')
                return json.loads(req.text)
            return None
        finally:
            # BUG FIX: the original put req.close() after the return
            # statements, so it was unreachable and the response leaked.
            req.close()
    except Exception as e:
        # BUG FIX: the original discarded the exception detail.
        logging.error('请求出错啦...%s', e)
        return None


def get_nbd():
    """
    Fetch and log day-granularity ingestion data from the maintenance page.

    Logs in via get_nbd_login_url_info(), then queries three endpoints in
    sequence with the returned 'ak' token: (1) latest-day ingestion counts
    per table type, (2) pending-file counts per data type, and (3) today's
    processed MR cell count per server.
    """
    logging.info("#####正在获取天粒度入库数据,请耐心等待#####\n")
    login_data = get_nbd_login_url_info()
    if login_data is not None:
        # success == 1 means the login was accepted and 'ak' is a usable token
        if login_data['success'] == 1:
            ak = login_data['ak']
            nbd_info = read_nbd_config_info()
            check_all_url = nbd_info['check_all_url']
            username = nbd_info['username']
            service_status_url = nbd_info['service_status_url']
            mr_service_url = nbd_info['mr_service_url']
            try:
                # 1) latest-day ingestion counts, all cities
                link = check_all_url + '?username={0}&usercity=ALL&ak={1}'.format(username, ak)
                req = requests.get(link, timeout=100)
                response = req.text
                data_json = json.loads(response)
                if data_json is not None and len(data_json) > 0:
                    # table types whose latest-day record info is reported
                    day_infos = ['mr_nr_day', 'mr_lte_day', 'kpi_lte_day', 'kpi_lte_nsa_day',
                            'kpi_nr_nsa_du_day', 'kpi_nr_nsa_cu_day', 'kpi_nr_sa_du_day', 'kpi_nr_sa_cu_day']
                    for data in data_json:
                        for info in day_infos:
                            if data[info] is not None and len(data[info]) > 0:
                                logging.info(info+':'+str(data[info]))
                            else:
                                # empty placeholder keeps the log line present per type
                                logging.info(info + ':{}')

                # 2) number of files still waiting to be processed, per type
                logging.info("\n#####正在获取本地平台当前程序状态数据,请耐心等待#####\n")
                link = service_status_url + '?username={0}&usercity=ALL&ak={1}'.format(username, ak)
                req = requests.get(link, timeout=100)
                response = req.text
                data_json = json.loads(response)
                if data_json is not None and data_json['Datas'] is not None:
                    for key in data_json['Datas']:
                        logging.info('待处理' + key + ':' + str(data_json['Datas'][key]) + '文件')

                # 3) today's processed MR cell count per server, sorted by server ip
                logging.info("\n#####正在获取MR服务器今日处理MR小区数 ,请耐心等待#####\n")
                link = mr_service_url + '?username={0}&usercity=ALL&ak={1}'.format(username, ak)
                req = requests.get(link, timeout=100)
                response = req.text
                data_json = json.loads(response)
                if data_json is not None and data_json['Datas'] is not None:
                    for key in sorted(data_json['Datas'], key=None, reverse=False):
                        logging.info('服务器ip:' + key + '-|-今日处理小区数:' + str(data_json['Datas'][key]))
                req.close()
            except Exception as e:
                print(e)
                logging.error("请求失败...")

        else:
            logging.error('登录失败请检查账号密码...')
    else:
        logging.error('访问失败,请检查维护页面程序是否正常...')


def mkdir(path):
    """
    Ensure that *path* exists as a directory.

    :param path: directory path to create (surrounding whitespace is stripped)
    :return: True if the directory was created, False if it already existed
    """
    target = path.strip()
    if os.path.exists(target):
        return False
    os.makedirs(target)
    return True


# ---- logging setup (runs at import time) ----
mkdir(log_path)
# one log file per hour, e.g. logs/2021030116.log
log_file = os.path.join(log_path, "%s.log" % cur_day_hour_str)
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
# default Formatter: message text only (matches original behavior)
formatter = logging.Formatter()
# BUG FIX: log_file already includes log_path; the original joined it with
# log_path a second time, which only worked by accident on POSIX because
# os.path.join discards the first argument when the second is absolute.
fh = logging.FileHandler(log_file)
fh.setLevel(logging.INFO)
fh.setFormatter(formatter)
ch = logging.StreamHandler()
ch.setLevel(logging.INFO)
ch.setFormatter(formatter)
logger.addHandler(ch)
logger.addHandler(fh)
logger.info("检查时间: %s" % datetime.datetime.strftime(cur_day, "%Y%m%d %H:%M:%S"))


def read_config_info(section):
    """
    Read one server's connection settings from the config file.

    :param section: config-file section name to read
    :return: dict with host (str), port (int), username and password
    """
    parser = configparser.RawConfigParser()
    parser.read(conf_file_path, encoding='utf-8')
    info = {
        "host": parser.get(section, 'host'),
        "port": parser.getint(section, 'port'),
        "username": parser.get(section, 'username'),
        "password": parser.get(section, 'password'),
    }
    parser.clear()
    return info


def read_all_server_info():
    """
    Load the server inventory from the server_info JSON file.

    Logs each server's host as it is read.

    :return: list of server dicts, or None when the file is missing,
             unparseable, or an entry lacks a 'host' key
    """
    try:
        with open(server_info_json_path, encoding="utf-8") as fp:
            servers = json.load(fp)
            for server in servers:
                logging.info(server['host'])
            return servers
    except Exception as err:
        logging.error(err)
        return None


def connect(host, username, password, port=22):
    """
    Open an SSH connection to a server.

    :param host: server ip
    :param username: login user
    :param password: login password
    :param port: ssh port (default 22)
    :return: a connected paramiko.SSHClient, or None when the connection fails
    """
    logging.info("#####服务器:{}#####".format(host))
    client = paramiko.SSHClient()
    try:
        client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        client.connect(host, port, username=username, password=password, timeout=20)
        return client
    except Exception as e:
        # BUG FIX: the original commented out the logging call, so connection
        # failures were completely silent and callers only saw None.
        logging.error('连接服务器%s失败: %s', host, e)
        return None


def ssh_exec_command(conn, command):
    """
    Run a command over an established SSH connection.

    :param conn: connected SSH client (paramiko)
    :param command: shell command to execute remotely
    :return: list of stdout lines, or None when execution raised
    """
    try:
        _, stdout, _ = conn.exec_command(command)
        return stdout.readlines()
    except Exception as err:
        logging.error(err)
        return None


def df_exec_command(conn):
    """
    Log disk usage (`df -h`) for the connected server.

    :param conn: connected SSH client
    """
    logging.info("#####正在检查磁盘空间,请耐心等待#####")
    output = ssh_exec_command(conn, 'df -h')
    logging.info(''.join(output))


def free_exec_command(conn):
    """
    Log memory usage (`free`) for the connected server.

    :param conn: connected SSH client
    """
    logging.info("#####正在检查内存使用率,请耐心等待#####")
    output = ssh_exec_command(conn, 'free')
    logging.info(''.join(output))


def load_stat_exec_command(conn):
    """
    Log the load averages read from /proc/loadavg on the connected server.

    Requires an account able to read /proc/loadavg; the five whitespace-
    separated fields are the 1/5/15-minute averages, the running/total
    process ratio, and the most recently started pid.

    :param conn: connected SSH client
    """
    logging.info("#####正在获取负载信息,请耐心等待#####")
    fields = ''.join(ssh_exec_command(conn, 'cat /proc/loadavg')).split()
    stats = {
        '1分钟': fields[0],
        '5分钟': fields[1],
        '15分钟': fields[2],
        '平均': fields[3],
        '最近运行的pid': fields[4],
    }
    logging.info(stats)
    logging.info('\n')


def account_check_exec_command(conn):
    """
    Collect account information from the remote server.

    Reads /etc/shadow over SSH, lists accounts whose password field is not
    locked (no '*' or '!'), and lists accounts with an empty password field.

    :param conn: connected SSH client
    :return: formatted summary of active accounts and empty-password accounts
    """
    logging.info("#####正在检查服务器用户信息,请耐心等待#####")
    account_list = []
    shadow = ''.join(ssh_exec_command(conn, 'cat /etc/shadow'))
    for line in shadow.split('\n'):
        # locked accounts carry '*' or '!' in the password field; keep the rest
        if re.search(r'\*|!', line) is None:
            match = re.findall(r'(.+?):', line)
            if match:
                account_list.append(match[0])
    # BUG FIX: the original ran this awk with os.popen on the LOCAL machine,
    # so the empty-password check never inspected the remote server.
    anonymous_account = ''.join(
        ssh_exec_command(conn, "awk -F: 'length($2)==0 {print $1}' /etc/shadow"))
    account = '存在的账户:\n{0}\n空口令用户:\n{1}\n'.format(account_list, anonymous_account)
    return account


def gp_exec_command(conn):
    """
    Check GreenPlum status with `gpstate -c` (only when logged in as gpadmin).

    :param conn: connected SSH client
    """
    logging.info("#####正在检查gp数据库状态,请耐心等待#####\n")
    current_user = ''.join(ssh_exec_command(conn, 'whoami')).strip()
    if current_user == 'gpadmin':
        logging.info(''.join(ssh_exec_command(conn, 'gpstate -c')))
    else:
        logging.info('当前账号不是gpadmin账号')


def get_gp_sql():
    """
    Build the union query that counts the most recent day's rows in every
    MR/KPI day table; the ${...} placeholders are substituted by the caller
    (gp_data_exec_command) with resolved partition-table names.

    BUG FIX: the kpi_nr_sa_du / kpi_nr_sa_cu rows previously referenced the
    *_nsa_* placeholders, so the SA tables were never counted even though
    the caller resolves and substitutes ${kpi_nr_sa_du_table_name} and
    ${kpi_nr_sa_cu_table_name}.

    :return: SQL text containing ${table_name} placeholders
    """
    sql = '''
select 'mr_lte' as type,dt,count(1) from ${mr_lte_table_name} where dt = (select max(dt) from ${mr_lte_table_name}) group by dt
union all
select 'mr_nr' as type,dt,count(1) from ${mr_nr_table_name} where dt = (select max(dt) from ${mr_nr_table_name}) group by dt
union all
select 'kpi_lte' as type,dt,count(1) from ${kpi_lte_table_name} where dt = (select max(dt) from ${kpi_lte_table_name}) group by dt
union all
select 'kpi_nb' as type, dt,count(1) from ${kpi_nb_table_name} where dt = (select max(dt) from ${kpi_nb_table_name}) group by dt
union all
select 'kpi_lte_nsa' as type,dt,count(1) from ${kpi_lte_nsa_table_name} where dt = (select max(dt) from ${kpi_lte_nsa_table_name}) group by dt
union all
select 'kpi_nr_nsa_du' as type,dt,count(1) from ${kpi_nr_nsa_du_table_name} where dt = (select max(dt) from ${kpi_nr_nsa_du_table_name}) group by dt
union all
select 'kpi_nr_nsa_cu' as type,dt,count(1) from ${kpi_nr_nsa_cu_table_name}  where dt = (select max(dt) from ${kpi_nr_nsa_cu_table_name}) group by dt
union all
select 'kpi_nr_sa_du' as type,dt,count(1) from ${kpi_nr_sa_du_table_name}  where dt = (select max(dt) from ${kpi_nr_sa_du_table_name}) group by dt
union all
select 'kpi_nr_sa_cu' as type,dt,count(1) from ${kpi_nr_sa_cu_table_name} where dt = (select max(dt) from ${kpi_nr_sa_cu_table_name}) group by dt
    '''
    return sql


def replace_table(results):
    """
    Strip psql decoration from a single-value query result.

    Removes the 'max' column header, dash separators, the '(1 row)' footer
    and all newlines, then trims whitespace, leaving only the table name.

    :param results: raw psql output text
    :return: cleaned table name string
    """
    for token in ('max', '-', '(1 row)', '\n'):
        results = results.replace(token, '')
    return results.strip()


def _latest_table_name(conn, table_pattern):
    """Resolve the newest smartinsight table matching *table_pattern* (a SQL
    LIKE pattern) by running psql over SSH and stripping the psql decoration."""
    sql = ("SELECT max(tablename) FROM pg_tables WHERE schemaname = 'smartinsight' "
           "and tablename like '{}' limit 1").format(table_pattern)
    cmd = 'psql -d lte_mr -U gpadmin -c "{}"'.format(sql.strip())
    return replace_table(''.join(ssh_exec_command(conn, cmd)))


def gp_data_exec_command(conn):
    """
    Report the most recent day's row counts for every MR/KPI day table by
    resolving each monthly partition table and running the union query from
    get_gp_sql(). (Superseded by the maintenance-page check in get_nbd; kept
    for reference.)

    Only runs when the SSH account is gpadmin.

    :param conn: connected SSH client
    """
    logging.info("#####正在获取天粒度入库数据,请耐心等待#####\n")
    username = ''.join(ssh_exec_command(conn, 'whoami'))
    if 'gpadmin' != username.strip():
        logging.info('当前账号不是gpadmin账号')
        return

    # placeholder name -> LIKE pattern of the monthly partition table to resolve
    # (replaces nine copy-pasted lookup blocks in the original)
    patterns = {
        'mr_lte_table_name': 'mr_lte_day_2_____',
        'mr_nr_table_name': 'mr_nr_day_2_____',
        'kpi_lte_table_name': 'kpi_lte_day_2_____',
        'kpi_nb_table_name': 'kpi_nb_day_2_____',
        'kpi_lte_nsa_table_name': 'kpi_lte_nsa_day_2_____',
        'kpi_nr_nsa_du_table_name': 'kpi_nr_nsa_du_day_2_____',
        'kpi_nr_nsa_cu_table_name': 'kpi_nr_nsa_cu_day_2_____',
        'kpi_nr_sa_du_table_name': 'kpi_nr_sa_du_day_2_____',
        'kpi_nr_sa_cu_table_name': 'kpi_nr_sa_cu_day_2_____',
    }
    sql = get_gp_sql()
    for placeholder, pattern in patterns.items():
        sql = sql.replace('${%s}' % placeholder, _latest_table_name(conn, pattern))
    cmd = 'psql -d lte_mr -U gpadmin -c "{}"'.format(sql.strip())
    logging.info(''.join(ssh_exec_command(conn, cmd)))


def tomcat_nginx_exec_command(conn):
    """
    Log the nginx and tomcat process lists on the connected server.

    :param conn: connected SSH client
    """
    for service in ('nginx', 'tomcat'):
        logging.info("#####正在检查 {} 状态,请耐心等待#####".format(service))
        output = ssh_exec_command(conn, 'ps -ef | grep {}'.format(service))
        logging.info(''.join(output))


def cj_exec_command(conn, type_str, host):
    """
    Check that the collection programs expected on a kpi/mr server are running.

    Logs a running-instance count for each expected process name, then waits
    for operator confirmation. Does nothing for other server types.

    :param conn: connected SSH client
    :param type_str: server role ('kpi' or 'mr'; anything else is skipped)
    :param host: server ip, used only in the confirmation prompt
    """
    expected = {
        'kpi': ['northbase.Main', 'parse.ParseNRM', 'parse.ParsePM',
                'download.MRTaskGenerator', 'summary.KPI2GP', 'summary.MR2GP',
                'summary.KPISummary', 'summary.MRSummary', 'redis.LoadWr', 'redis-server'],
        'mr': ['download.MRDownload', 'parse.ParseMR', 'redis-server'],
    }
    process_arr = expected.get(type_str, [])
    if not process_arr:
        return
    logging.info("#####正在检查{}采集程序是否正常,请耐心等待#####\n".format(type_str))
    for process in process_arr:
        count_cmd = 'ps -ef | grep {}| grep -v grep |wc -l'.format(process)
        raw_count = ''.join(ssh_exec_command(conn, count_cmd))
        logging.info(process + ':{}'.format(int(raw_count.replace('\n', ''))))
        logging.info('------------------------------------')
    input("请确认{}服务器:{}的集团采集程序,按回车键继续...".format(type_str, host))


def read_cdh_config_info():
    """
    Load the CDH (Cloudera Manager) connection settings from the config file.

    :return: dict with host, port, username, password and cluster_name
    """
    section = 'cdh_server_info'
    parser = configparser.RawConfigParser()
    parser.read(conf_file_path, encoding='utf-8')
    info = {
        "host": parser.get(section, 'host'),
        "cluster_name": parser.get(section, 'cluster_name'),
        "port": parser.get(section, 'port'),
        "username": parser.get(section, 'username'),
        "password": parser.get(section, 'password'),
    }
    parser.clear()
    return info


def cdh_login_cookie():
    """
    Log in to Cloudera Manager and return its session cookie value.

    :return: CLOUDERA_MANAGER_SESSIONID cookie string, or None on any failure
    """
    try:
        logging.info("\n#####正在登录CDH...请耐心等待#####\n")
        info = read_cdh_config_info()
        headers = {
            "Content-Type": "application/x-www-form-urlencoded",
            "Accept": "*/*",
            "Connection": "keep-alive",
        }
        link = 'http://{0}:{1}/j_spring_security_check'.format(info['host'], info['port'])
        login_data = {'j_username': info['username'], 'j_password': info['password']}
        # allow_redirects=False: the session cookie is set on the login
        # endpoint's redirect response itself
        req = requests.post(link, headers=headers, timeout=100, data=login_data, allow_redirects=False)
        cookies = req.cookies['CLOUDERA_MANAGER_SESSIONID']
        req.close()
        return cookies
    except Exception as e:
        # BUG FIX: the original concatenated str + Exception ('…' + e), which
        # raises TypeError inside the handler and masks the real error.
        logging.error('请求登录CDH失败...请手动登录CDH检查服务.....%s', e)
        return None


def cdh_health_status(health_type):
    """
    Translate a Cloudera Manager health-summary code into a Chinese description.

    :param health_type: CM health code, e.g. 'GOOD', 'BAD', 'CONCERNING'
    :return: description string, or None for unknown codes
    """
    descriptions = {
        'DISABLED': '服务已禁用(无须检查)',
        'HISTORY_NOT_AVAILABLE': '服务长期不可用',
        'NOT_AVAILABLE': '服务不可用',
        'GOOD': '服务运行状态良好(绿色)',
        'CONCERNING': '服务存在告警信息(黄色)',
        'BAD': '存在挂起程序,请登录检查反馈情况!!!!(红色)',
    }
    return descriptions.get(health_type)


def get_cdh_time_series(cookies, host, port):
    """
    Query Cloudera Manager's timeseries API for the total number of open file
    descriptors across all HBase regionservers.

    :param cookies: 'CLOUDERA_MANAGER_SESSIONID=...' Cookie header value
    :param host: CM host
    :param port: CM port
    :return: latest metric value, or None if unavailable or the request failed
    """
    if cookies is None or len(cookies) == 0:
        return None
    try:
        headers = {
            "Accept": "*/*",
            "Connection": "keep-alive",
            "Cookie": cookies
        }
        query_str = 'SELECT total_fd_open_across_regionservers WHERE entityName = "hbase" AND category = SERVICE'
        link = 'http://{0}:{1}/api/v19/timeseries?query={2}'.format(host, port, query_str)
        req = requests.get(link, headers=headers, timeout=100)
        try:
            data_json = json.loads(req.text)
        finally:
            # BUG FIX: the original only closed the response when no value was
            # found (req.close() was placed after the return inside the loop).
            req.close()
        if data_json is not None and data_json['items'] is not None:
            for item in data_json['items']:
                if item['timeSeries'] is not None and len(item['timeSeries']) > 0:
                    data = item['timeSeries'][0]['data']
                    # BUG FIX: the None check must precede len(); the original
                    # evaluated len(data) first, raising TypeError on None.
                    if data is not None and len(data) > 0:
                        return data[0]['value']
        return None
    except Exception:
        logging.error('请求CDH数据失败...请手动登录CDH检查服务.....')
        return None


def get_cdh_clusters_info():
    """
    Walk every service in the configured CDH cluster (plus the Cloudera
    Management Service), logging each service's health summary and health
    checks and pausing for operator confirmation per service. For hbase the
    open-file-descriptor total is also reported.
    """
    logging.info("#####正在获取CDH服务状态,请耐心等待#####\n")
    cookies = cdh_login_cookie()
    if cookies is None or len(cookies) == 0:
        logging.error('获取CDH--cookie失败...请手动登录CDH检查服务.....')
        return
    info = read_cdh_config_info()
    host = info['host']
    port = info['port']
    cluster_name = info['cluster_name']
    cookies = 'CLOUDERA_MANAGER_SESSIONID={}'.format(cookies)
    try:
        headers = {
            "Accept": "*/*",
            "Connection": "keep-alive",
            "Cookie": cookies
        }
        # per-service health for the named cluster
        link = 'http://{0}:{1}/api/v19/clusters/{2}/services'.format(host, port, cluster_name)
        req = requests.get(link, headers=headers, timeout=100)
        data_json = json.loads(req.text)
        if data_json is not None and data_json['items'] is not None:
            for item in data_json['items']:
                logging.info('------>' + item['name'] + ':' + cdh_health_status(item['healthSummary']) + '<------')
                for check in item['healthChecks']:
                    logging.info(check['name'] + ':' + cdh_health_status(check['summary']))
                if item['name'] == 'hbase':
                    open_file_num = get_cdh_time_series(cookies, host, port)
                    # BUG FIX: get_cdh_time_series returns None on failure; the
                    # original compared None >= 15000, a TypeError on Python 3.
                    if open_file_num is None:
                        logging.info('无法获取HBASE打开总文件描述符数,请手动登录CDH检查')
                    elif open_file_num >= 15000:
                        logging.info('HBASE打开总文件描述符数大于等于1.5W--建议重启hbase!!!!')
                    else:
                        logging.info('HBASE打开总文件描述符数:{}'.format(open_file_num))
                logging.info('\n')
                input("请确认{}服务状态,按回车键继续...".format(item['name']))
                logging.info('\n')
        # Cloudera Management Service health
        link = 'http://{0}:{1}/api/v19/cm/service'.format(host, port)
        req = requests.get(link, headers=headers, timeout=100)
        data_json = json.loads(req.text)
        if data_json is not None:
            logging.info('------>' + data_json['displayName'] + ':' + cdh_health_status(data_json['healthSummary']) + '<------')
            for check in data_json['healthChecks']:
                logging.info(check['name'] + ':' + cdh_health_status(check['summary']))
                logging.info('\n')
        req.close()
    except Exception:
        logging.error('请求登录CDH失败...请手动登录CDH检查服务.....')


def main():
    """
    Interactive inspection driver.

    Iterates every server listed in server_info.json running the disk /
    memory / load and collector-process checks (plus gpstate on the gp
    master), then checks the maintenance page, the CDH cluster, and the
    tomcat/nginx server. Pauses for operator confirmation after each step.
    """
    false_server_info = []  # hosts we could not connect to, reported at the end
    logging.info('开始获取服务器所有信息...')
    server_info = read_all_server_info()
    logging.info("\n###########################\n")
    time.sleep(1)
    # BUG FIX: read_all_server_info returns None when the JSON file is
    # missing or invalid; the original then crashed iterating over None.
    if server_info is None:
        logging.error('无法读取服务器信息文件,巡检终止...')
        return
    input("请确认待巡检的服务器ip信息,按回车键继续。")
    for info in server_info:
        host = info['host']
        username = info['username']
        password = info['password']
        port = info['port']
        type_str = info['type']
        conn = connect(host, username, password, port)
        if conn is None:
            logging.info('请检查{}服务器:{}的连通性...'.format(type_str, host))
            false_server_info.append(host)
        else:
            df_exec_command(conn)
            free_exec_command(conn)
            load_stat_exec_command(conn)
            input("请确认{}服务器:{}的磁盘/内存/服务器负载使用信息,按回车键继续...".format(type_str, host))
            time.sleep(1)
            logging.info("\n")
            cj_exec_command(conn, type_str, host)
            if type_str == 'gp':
                gp_exec_command(conn)
                input("请确认GP数据库状态,按回车键继续...")
            conn.close()
        logging.info("\n-------------------------------------------------------------------------\n")

    # yesterday's ingestion, via the maintenance page
    get_nbd()
    input("请确认天粒度入库数据是否正常,按回车键继续...")
    # CDH cluster health
    get_cdh_clusters_info()
    input("请确认CDH集群状态是否正常,按回车键继续...")
    logging.info("\n")

    server_info = read_config_info('tomcat_nginx_server_info')
    conn = connect(server_info['host'], server_info['username'], server_info['password'], server_info['port'])
    # BUG FIX: connect returns None on failure; the original called
    # tomcat_nginx_exec_command and conn.close() on None unconditionally.
    if conn is not None:
        tomcat_nginx_exec_command(conn)
        input("请确认tomcat/nginx状态,按回车键继续...")
        conn.close()
    else:
        logging.error('无法连接tomcat/nginx服务器,请检查连通性...')
    logging.info("\n")
    logging.info('最后.请检查以下服务器的连通性:{}'.format(false_server_info))


if __name__ == '__main__':
    # run the interactive inspection when executed as a script
    main()
    logging.info('end....')

3.配置python环境及需要安装的依赖

python这边是使用python3.8的,我想快速安装,所以使用Miniconda3-latest-Linux-x86_64.sh

依赖分别是ssh:

PyNaCl-1.4.0/paramiko-2.7.2/bcrypt-3.2.0

requests:

requests-2.22.0-py2.py3-none-any.whl

1.首先创建一个python_dev账号:
# 新建账号
adduser python_dev
# 设置密码
passwd python_dev
# 密码
Rj#Wvk031lyCx2
2.安装python
[python_dev@sxhdp01datanode05 python]$ chmod +x Miniconda3-latest-Linux-x86_64.sh 
[python_dev@sxhdp01datanode05 python]$ ./Miniconda3-latest-Linux-x86_64.sh 


installation finished.
Do you wish the installer to initialize Miniconda3
by running conda init? [yes|no]
[no] >>> no  #这里我自己是不喜欢初始化的所以选择了no,如果想设置初始化可以是yes
3.安装依赖
# 解压
tar -zxvf  bcrypt-3.2.0.tar.gz
# 进入文件夹并安装依赖
cd bcrypt-3.2.0
~/miniconda3/bin/python setup.py install
# 查看是否安装成功
~/miniconda3/bin/pip list

#同理
tar -zxvf PyNaCl-1.4.0.tar.gz
tar -zxvf  paramiko-2.7.2.tar.gz
.....

#安装 requests,因为是whl包所以使用pip安装
~/miniconda3/bin/pip install requests-2.22.0-py2.py3-none-any.whl 
4.手动运行执行
~/miniconda3/bin/python check_server.py

 

暂无评论

发送评论 编辑评论


				
|´・ω・)ノ
ヾ(≧∇≦*)ゝ
(☆ω☆)
(╯‵□′)╯︵┴─┴
 ̄﹃ ̄
(/ω\)
∠( ᐛ 」∠)_
(๑•̀ㅁ•́ฅ)
→_→
୧(๑•̀⌄•́๑)૭
٩(ˊᗜˋ*)و
(ノ°ο°)ノ
(´இ皿இ`)
⌇●﹏●⌇
(ฅ´ω`ฅ)
(╯°A°)╯︵○○○
φ( ̄∇ ̄o)
ヾ(´・ ・`。)ノ"
( ง ᵒ̌皿ᵒ̌)ง⁼³₌₃
(ó﹏ò。)
Σ(っ °Д °;)っ
( ,,´・ω・)ノ"(´っω・`。)
╮(╯▽╰)╭
o(*////▽////*)q
>﹏<
( ๑´•ω•) "(ㆆᴗㆆ)
😂
😀
😅
😊
🙂
🙃
😌
😍
😘
😜
😝
😏
😒
🙄
😳
😡
😔
😫
😱
😭
💩
👻
🙌
🖕
👍
👫
👬
👭
🌚
🌝
🙈
💊
😶
🙏
🍦
🍉
😣
Source: github.com/k4yt3x/flowerhd
颜文字
Emoji
小恐龙
花!
上一篇
下一篇