# -*- coding: utf-8 -*-
"""Thread-pool crawler for the historical-figures dictionary on cidianwang.

Search endpoint: https://search.cidianwang.com/

The list of names to process can be downloaded from
https://jhc001.lanzouw.com/iYkEqwj0o9e (password: b0dh).

Note: the output is not post-processed, and a small share of the records
is parsed imperfectly.

This script is for reference and learning only; any other use is at your
own risk.
"""
from concurrent.futures import ThreadPoolExecutor, as_completed
import asyncio  # unused here; kept from the original import list
import os
import re
import threading
import time  # unused here; kept from the original import list
import traceback

import requests
import threadpool  # unused here; kept from the original import list
from lxml import etree

SEARCH_URL = 'https://search.cidianwang.com/'
INPUT_FILE = 'word1.txt'
OUTPUT_DIR = '../results'
OUTPUT_FILE = '../results/results1.txt'
MAX_WORKERS = 50
# Seconds to wait for the server before giving up; without a timeout a single
# stalled connection would block its worker thread forever.
REQUEST_TIMEOUT = 15

# Replace with your own cookies (copy them from a logged-in browser session).
# The original snippet only carried a "use your own" placeholder here.
COOKIES = {}

HEADERS = {
    'Connection': 'keep-alive',
    'Pragma': 'no-cache',
    'Cache-Control': 'no-cache',
    'sec-ch-ua': '"Google Chrome";v="89", "Chromium";v="89", ";Not A Brand";v="99"',
    'sec-ch-ua-mobile': '?0',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Sec-Fetch-Site': 'same-site',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-User': '?1',
    'Sec-Fetch-Dest': 'document',
    'Referer': 'https://www.cidianwang.com/',
    'Accept-Language': 'zh-CN,zh;q=0.9',
}

# NOTE(review): the author's regular expression was destroyed when this
# snippet was extracted from a web page -- only the tail `">` survived. The
# pattern below is a best-effort stand-in, NOT the original: it captures an
# absolute www.cidianwang.com link whose tag closes right after the href.
# Verify it against a live results page and adjust before trusting the output.
RESULT_LINK_RE = re.compile(r'href="(https?://www\.cidianwang\.com/[^"]+)">')

# NOTE(review): the original XPath started with `//div[@]`, i.e. the attribute
# test on the outer div was lost in extraction. Dropping that empty predicate
# only widens the match on the outer div; confirm against the live page.
DETAIL_TEXT_XPATH = '//div/div[@id="left"]/div/p[2]//text()'

# Worker threads all append to the same file; serialize the writes so that
# records from different threads cannot interleave.
_WRITE_LOCK = threading.Lock()


def spider(name):
    """Search the site for *name* and store the detail text of the first hit.

    Args:
        name: Person name to look up. Surrounding whitespace (such as the
            newline left by reading the name from a file) is ignored; a blank
            name is skipped.

    Raises:
        requests.RequestException: If a request fails or times out.
    """
    name = name.strip()
    if not name:
        return

    # Query parameters are sent exactly as in the original script.
    # NOTE(review): 'm' = '12' presumably selects the historical-figures
    # section -- confirm against the site.
    params = (
        ('q', name),
        ('m', '12'),
        ('y', '0'),
    )
    response = requests.get(
        SEARCH_URL,
        headers=HEADERS,
        params=params,
        cookies=COOKIES,
        timeout=REQUEST_TIMEOUT,
    )
    response.encoding = 'utf8'

    links = RESULT_LINK_RE.findall(response.text)
    if links:
        # Only the first search hit is followed.
        detail_page2(name, links[0])
        print(name + ' --> 搜索成功')
    else:
        print(name + ' --> 搜索结果为空')


# Dictionary entry parsing
def detail_page2(name, r_lis):
    """Download a detail page and save the text selected by the XPath.

    Args:
        name: Person name the page belongs to (used as the record key).
        r_lis: URL of the detail page.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    response = requests.get(r_lis, timeout=REQUEST_TIMEOUT)
    response.encoding = 'utf8'
    tree = etree.HTML(response.text)
    fragments = tree.xpath(DETAIL_TEXT_XPATH)

    # Join the text nodes directly instead of scrubbing the repr() of the
    # list: scrubbing removed genuine commas/quotes/brackets from the text and
    # leaked escape sequences such as "\u3000" into the output.
    detail = ' '.join(part.strip() for part in fragments if part.strip())
    if detail:
        save_data(name, detail)
    else:
        print('空')


# Save data
def save_data(name, detail):
    """Append one ``name : detail`` record to the results file.

    Args:
        name: Person name; surrounding whitespace is stripped.
        detail: Text to store for that person.
    """
    record = '{} : {}\n'.format(name.strip(), detail)
    with _WRITE_LOCK:
        with open(OUTPUT_FILE, 'a', encoding='utf-8') as fp:
            fp.write(record)


def main():
    """Read the name list and crawl every name on a thread pool."""
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    try:
        with open(INPUT_FILE, 'r', encoding='utf-8') as fp:
            # Strip line endings and skip blank lines so the newline is not
            # sent as part of the search query.
            names = [line.strip() for line in fp if line.strip()]

        # Thread pool
        with ThreadPoolExecutor(MAX_WORKERS) as executor:
            pending = {executor.submit(spider, name): name for name in names}
            # Collect every future: an exception raised inside a worker is
            # otherwise stored on the future and never reported.
            for future in as_completed(pending):
                error = future.exception()
                if error is not None:
                    print(pending[future] + ' --> 处理失败: ' + repr(error))
    except Exception:
        # Show what actually went wrong, then pause so a console window
        # opened by double-click does not close immediately.
        traceback.print_exc()
        input('执行异常!!!')


if __name__ == '__main__':
    main()
# 欢迎分享,转载请注明来源:内存溢出
# 评论列表(0条)