python批量爬取域名备案 – 记忆角落

python批量爬取域名备案

/ 0评 / 2

Python 批量查询域名备案信息的小工具,查询接口使用站长之家(icp.chinaz.com)。

import re, requests, time, xlrd, random, csv
from lxml import etree

# Domains to look up, in the order they appear in the input file.
domain_list = []
# Alternative input source, kept for reference: read the domains from the
# first column of an .xls workbook instead of the text file below.
# workbook = xlrd.open_workbook('未爬.xls')
# sheet0 = workbook.sheet_by_index(0)
# cols = sheet0.col_slice(0,1)
# for col in cols:
#     domain_list.append(col.value)

# newline='' passes line endings through untranslated, so a line can end in
# '\r\n', '\n' or '\r'. Strip every variant (plus stray surrounding
# whitespace) and skip blank lines, so that no host containing a newline and
# no empty host ends up in the comma-joined request payload.
with open('111.txt',encoding='utf-8',newline='')as fk:
    for line in fk:
        domain = line.strip()
        if domain:
            domain_list.append(domain)

# Pool of browser User-Agent strings; one is picked at random per request.
user_Agent_list = [
    # Chrome 85
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36",
    # Firefox 6
    "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0",
    # Safari 5.1
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/534.50 "
    "(KHTML, like Gecko) Version/5.1 Safari/534.50",
    # Opera 11.50
    "Opera/9.80 (Windows NT 10.0; U; zh-cn) Presto/2.9.168 Version/11.50",
    # Chrome 87
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36",
]

# Index into domain_list of the first domain still to be queried.
# NOTE(review): presumably meant as a manual resume offset (set it to the last
# printed progress value after an interrupted run) — confirm with the author.
a = 0
url = 'http://icp.chinaz.com/searchs'
# Number of domains sent in a single POST.
batch_size = 20
# 1-based <td> positions of the six exported fields, in csv_header order.
field_columns = (1, 2, 3, 4, 5, 8)
csv_header = ['域名','主办单位名称','单位性质','网站备案/许可证号','网站名称','审核时间']
# NOTE(review): this is a captured browser cookie, including anti-forgery and
# session tokens, hard-coded in source. Consider loading it from configuration
# or the environment instead. Only the trailing Hm_lpvt_* value is filled in
# per request (with the current Unix time).
cookie_template = 'UM_distinctid=176dabed56b80a-080978cc77e73-376b4502-1fa400-176dabed56c952; __guid=31546918.3636803744203470000.1610026817897.5715; __gads=ID=2c6886bd82b7280d-2280524a92c500e4:T=1610026898:RT=1610026898:S=ALNI_MbyH5W7HulKDaCwCpZdoBvNfCv5fg; toolbox_urls=www.ccbechina.cn|www.apple.iducs.cn; qHistory=aHR0cDovL3dob2lzLmNoaW5hei5jb20vK1dob2lz5p+l6K+ifGh0dHA6Ly90b29sLmNoaW5hei5jb21f56uZ6ZW/5bel5YW3fGh0dHA6Ly9yYW5rLmNoaW5hei5jb20vcmFua2FsbC9f5p2D6YeN57u85ZCI5p+l6K+ifGh0dHA6Ly9zZW8uY2hpbmF6LmNvbV9TRU/nu7zlkIjmn6Xor6J8aHR0cDovL3Rvb2wuY2hpbmF6LmNvbS9kbnMvX0Ruc+afpeivog==; Hm_lvt_aecc9715b0f5d5f7f34fba48a3c511d6=1614822213,1614822577,1614822679,1614823108; CNZZDATA5082706=cnzz_eid%3D817694234-1610023182-%26ntime%3D1615443601; Hm_lvt_ca96c3507ee04e182fb6d097cb2a1a4c=1614158602,1614757255,1615271947,1615445490; .AspNetCore.Antiforgery.-Z5WMyCX4K0=CfDJ8GYV1qq4FPhNvMPl1WmHHp4TJ3UkqvXhcEpr97APD9DsO6WWhvHhS1Ur7lynrac2voNvP_6CKjUNJ7GfxR0Y8Hvzh5CzvnqGj1zDfZqI-uOal1z6njEfqbHpCtZTRIzvCbnZaH2ylkWKvQ8HYwdptNY; bbsmax_user=a3edcb67-4488-4058-ac01-ec29ef25c2d3; avatarId=14f364b2-af95-4174-a58f-b95415e9ad1c-; .AspNetCore.Session=CfDJ8GYV1qq4FPhNvMPl1WmHHp7xuHiscEHg0wvC5lfyLapGtl8WJLhWLk%2BrvYNFAM9CCKxQ6RnvIiEgR86UrCH%2FavX838WgaU%2BU%2FbXLz305a97b8qWufBUMzpsSVhMKMMBzfgT%2F9LSOKw1wTwgn9ND7ySHWDWbXg%2BXyPKaPVWG7dFdm; monitor_count=4; Hm_lpvt_ca96c3507ee04e182fb6d097cb2a1a4c={0}'


def _first_text(row, column):
    """Return the first text node inside the 1-based <td> `column` of `row`.

    Returns '' when the cell is missing or holds no text, so that one sparse
    row cannot abort the whole run with an IndexError.
    """
    texts = row.xpath('./td[%d]//text()' % column)
    return texts[0] if texts else ''


with open('test1.csv','a',newline='')as fk:
    writer = csv.writer(fk)
    # The file is opened in append mode, so only write the header when the
    # file is still empty; otherwise every run would add another header row.
    # seek(0, 2) moves to the end of the file and returns that position.
    if fk.seek(0, 2) == 0:
        writer.writerow(csv_header)
    # Stepping by batch_size from the start offset visits exactly the
    # non-empty batches; no trailing request with an empty 'hosts' is sent.
    for start in range(a, len(domain_list), batch_size):
        local_time = int(time.time())
        headers = {
            'User-Agent': random.choice(user_Agent_list),
            'Cookie': cookie_template.format(local_time),
            'Host':'icp.chinaz.com',
            'Referer':'http://icp.chinaz.com/web'
        }
        form_data = {
            'hosts':','.join(domain_list[start:start+batch_size])
        }

        # A timeout keeps a stalled connection from hanging the run forever;
        # raise_for_status() stops on an HTTP error instead of silently
        # skipping the batch.
        res = requests.post(url,headers=headers,data=form_data,timeout=30)
        res.raise_for_status()
        html = etree.HTML(res.content.decode('utf-8'))
        # etree.HTML returns None for an empty document; treat it as no rows.
        trs = html.xpath('//tbody[@id="result_table"]/tr') if html is not None else []
        for tr in trs:
            csv_content = [_first_text(tr, column) for column in field_columns]
            print(csv_content)
            writer.writerow(csv_content)
        # Keep `a` pointing at the next unprocessed index for the progress line.
        a = start + batch_size
        time.sleep(1)  # pause between batches
        print('现在是第%s个'% a)

 

发表评论

您的电子邮箱地址不会被公开。 必填项已用*标注