Python 批量爬取域名备案信息的小工具(接口:站长之家)
# Batch ICP-record lookup script.
#
# Reads domains (one per line) from DOMAIN_FILE, posts them in batches to the
# chinaz ICP search endpoint, parses the HTML result table and appends one CSV
# row per result row to OUTPUT_FILE.
import csv
import os
import random
import re
import time

import requests
import xlrd
from lxml import etree

DOMAIN_FILE = '111.txt'
OUTPUT_FILE = 'test1.csv'
# Number of domains submitted in a single POST.
BATCH_SIZE = 20
# Seconds to wait for the server before giving up on a request; without a
# timeout a stalled connection would block the script forever.
REQUEST_TIMEOUT = 30
# 1-based <td> positions extracted from each result row, in the same order as
# the CSV header columns.
RESULT_COLUMNS = (1, 2, 3, 4, 5, 8)

# NOTE(review): this looks like a cookie captured from a browser session; it
# will presumably expire and may need refreshing -- confirm before relying on
# it. The trailing `{0}` placeholder is filled with the current Unix timestamp
# for every request.
COOKIE_TEMPLATE = (
    'UM_distinctid=176dabed56b80a-080978cc77e73-376b4502-1fa400-176dabed56c952; '
    '__guid=31546918.3636803744203470000.1610026817897.5715; '
    '__gads=ID=2c6886bd82b7280d-2280524a92c500e4:T=1610026898:RT=1610026898:S=ALNI_MbyH5W7HulKDaCwCpZdoBvNfCv5fg; '
    'toolbox_urls=www.ccbechina.cn|www.apple.iducs.cn; '
    'qHistory=aHR0cDovL3dob2lzLmNoaW5hei5jb20vK1dob2lz5p+l6K+ifGh0dHA6Ly90b29sLmNoaW5hei5jb21f56uZ6ZW/5bel5YW3fGh0dHA6Ly9yYW5rLmNoaW5hei5jb20vcmFua2FsbC9f5p2D6YeN57u85ZCI5p+l6K+ifGh0dHA6Ly9zZW8uY2hpbmF6LmNvbV9TRU/nu7zlkIjmn6Xor6J8aHR0cDovL3Rvb2wuY2hpbmF6LmNvbS9kbnMvX0Ruc+afpeivog==; '
    'Hm_lvt_aecc9715b0f5d5f7f34fba48a3c511d6=1614822213,1614822577,1614822679,1614823108; '
    'CNZZDATA5082706=cnzz_eid%3D817694234-1610023182-%26ntime%3D1615443601; '
    'Hm_lvt_ca96c3507ee04e182fb6d097cb2a1a4c=1614158602,1614757255,1615271947,1615445490; '
    '.AspNetCore.Antiforgery.-Z5WMyCX4K0=CfDJ8GYV1qq4FPhNvMPl1WmHHp4TJ3UkqvXhcEpr97APD9DsO6WWhvHhS1Ur7lynrac2voNvP_6CKjUNJ7GfxR0Y8Hvzh5CzvnqGj1zDfZqI-uOal1z6njEfqbHpCtZTRIzvCbnZaH2ylkWKvQ8HYwdptNY; '
    'bbsmax_user=a3edcb67-4488-4058-ac01-ec29ef25c2d3; '
    'avatarId=14f364b2-af95-4174-a58f-b95415e9ad1c-; '
    '.AspNetCore.Session=CfDJ8GYV1qq4FPhNvMPl1WmHHp7xuHiscEHg0wvC5lfyLapGtl8WJLhWLk%2BrvYNFAM9CCKxQ6RnvIiEgR86UrCH%2FavX838WgaU%2BU%2FbXLz305a97b8qWufBUMzpsSVhMKMMBzfgT%2F9LSOKw1wTwgn9ND7ySHWDWbXg%2BXyPKaPVWG7dFdm; '
    'monitor_count=4; '
    'Hm_lpvt_ca96c3507ee04e182fb6d097cb2a1a4c={0}'
)

user_Agent_list = [
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Opera/9.80 (Windows NT 10.0; U; zh-cn) Presto/2.9.168 Version/11.50",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36"
]


def _load_domains(path):
    """Return the non-blank lines of *path*, stripped of surrounding whitespace.

    Stripping handles both ``\\n`` and ``\\r\\n`` line endings, and skipping
    blank lines keeps empty entries out of the comma-joined request payload.
    """
    domains = []
    with open(path, encoding='utf-8') as fh:
        for line in fh:
            domain = line.strip()
            if domain:
                domains.append(domain)
    return domains


def _build_headers():
    """Return request headers with a random User-Agent and a freshly stamped cookie."""
    return {
        'User-Agent': random.choice(user_Agent_list),
        'Cookie': COOKIE_TEMPLATE.format(int(time.time())),
        'Host': 'icp.chinaz.com',
        'Referer': 'http://icp.chinaz.com/web'
    }


def _cell_text(row, column):
    """Return the first text node inside the *column*-th (1-based) <td> of *row*.

    Returns an empty string when the cell is missing or has no text, so one
    incomplete row does not abort the whole run with an IndexError.
    """
    texts = row.xpath('./td[%d]//text()' % column)
    return texts[0] if texts else ''


domain_list = _load_domains(DOMAIN_FILE)
# Alternative input (disabled): load the domains from an .xls sheet instead.
# workbook = xlrd.open_workbook('未爬.xls')
# sheet0 = workbook.sheet_by_index(0)
# cols = sheet0.col_slice(0,1)
# for col in cols:
#     domain_list.append(col.value)

# Index of the first domain to query; set it to the last printed progress
# value to resume an interrupted run. It is advanced after every batch.
a = 0
url = 'http://icp.chinaz.com/searchs'

# The output is opened in append mode, so only emit the header when the file is
# new or empty; otherwise resumed runs would scatter header rows through the data.
write_header = not os.path.exists(OUTPUT_FILE) or os.path.getsize(OUTPUT_FILE) == 0

with open(OUTPUT_FILE, 'a', newline='') as fk:
    writer = csv.writer(fk)
    if write_header:
        csv_header = ['域名', '主办单位名称', '单位性质', '网站备案/许可证号', '网站名称', '审核时间']
        writer.writerow(csv_header)

    # Stepping through the list directly avoids the trailing request with an
    # empty `hosts` field that a precomputed batch count produced whenever the
    # remaining number of domains was a multiple of BATCH_SIZE (or zero).
    for start in range(a, len(domain_list), BATCH_SIZE):
        form_data = {
            'hosts': ','.join(domain_list[start:start + BATCH_SIZE])
        }
        res = requests.post(
            url,
            headers=_build_headers(),
            data=form_data,
            timeout=REQUEST_TIMEOUT,
        )
        # print(res.content.decode('utf-8'))
        html = etree.HTML(res.content.decode('utf-8'))
        trs = html.xpath('//tbody[@id="result_table"]/tr')
        for tr in trs:
            csv_content = [_cell_text(tr, column) for column in RESULT_COLUMNS]
            print(csv_content)
            writer.writerow(csv_content)

        a = start + BATCH_SIZE
        # Pause between batches to space out the requests.
        time.sleep(1)
        print('现在是第%s个' % a)
        # print(form_data)