目录:问题描述 / 原因分析 / 解决方案
解决方案:方法一 / 方法二 / 方法三
代码:代码一 / 代码二 / 代码三 / 代码四 / 代码五 / Test代码
# 项目场景:
Python 3.8

# 问题描述:
使用 Python 爬虫从列表页进入并爬取详情页时,返回的详情页 HTML 数据长度有限(内容不完整)。
原因分析:
频繁爬取目标网站,触发了该网站的反爬虫措施。
解决方案:
如果下面的方法都解决不了,可以先把要爬取的网页源码保存到本地,再进行后续处理。
方法一:换一个 VPN,相当于换一台电脑(换一个出口 IP)来执行程序。
方法二:复制目标网页的请求头(Headers)添加到代码中,并根据目标网站的实际情况修改。
# NOTE(review): the article's line breaks, indentation, backslashes and
# HTML-like literals were lost when the page was scraped.  The listings below
# are reconstructed; every literal marked "reconstructed" should be confirmed
# against the JavaScript a real 521 challenge page returns.
#
# "Code 1" and "Code 2" are independent listings that reuse function names
# (get_521_content, fixed_fun); save each one to its own file before running.
import re


def askURL(url):
    """Return browser-like request headers for the target site (method 2).

    The article shows only the header dict for this function, so ``url`` is
    accepted but not used.  The dict is returned so callers can pass it to
    their HTTP client.

    :param url: page about to be requested (unused in the visible snippet).
    :return: dict of HTTP request headers.
    """
    # Mimic a browser's request headers.  (The original comment mentions the
    # Douban server, but the Host header targets www.mafengwo.cn.)
    head = {
        # "*/*" had been garbled to "**" in the scraped listing.
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Host': 'www.mafengwo.cn',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36 Edg/97.0.1072.55',
    }
    return head


# ---------------------------------------------------------------------------
# Code 1
# ---------------------------------------------------------------------------
url = 'http://www.mafengwo.cn/poi/5423409.html'
# Code 1 reads ``head`` as a module-level name, but the listing only ever
# assigned it inside askURL(); bind it here so the listing can run.
head = askURL(url)

# Reconstructed: the scraped listing passed an empty pattern to re.findall,
# which can only ever produce an empty string.
_SCRIPT_PATTERN = re.compile(r'<script>(.*?)</script>')


def get_521_content(url, head):
    """Fetch *url* and return the inline script text plus the cookies it set.

    :param url: page to request.
    :param head: request headers to send.
    :return: ``(script_text, cookie_string)``; ``cookie_string`` is the
        response cookies joined as ``name=value; name=value``.
    """
    # Third-party modules are imported where they are used so that the pure
    # helpers in this listing stay importable without them.
    import requests

    req = requests.get(url, headers=head)
    cookies = '; '.join('='.join(item) for item in req.cookies.items())
    txt_521 = ''.join(_SCRIPT_PATTERN.findall(req.text))
    return (txt_521, cookies)


def fixed_fun(function):
    """Run the challenge JavaScript and return the ``__jsl_clearance`` pair.

    Uses the module-level ``url`` and ``head`` for a second request.

    :param function: script text taken from the first response.
    :return: the first ``;``-separated part of the string returned by the
        JavaScript function ``l``.
    """
    import execjs
    import requests

    func_return = function.replace('eval', 'return')
    # Kept from the listing; this compiled context is replaced below before
    # anything is called on it.
    content = execjs.compile(func_return)
    req = requests.get(url, headers=head)
    evaled_func = ''.join(_SCRIPT_PATTERN.findall(req.text))
    # Strip the browser-only parts of the script and turn the cookie
    # assignment into a return value.  The setTimeout literal is
    # reconstructed: its backslashes were lost in the scrape, and without
    # them the text is not a valid single-quoted JavaScript string.
    mode_func = (
        evaled_func
        .replace('while(window._phantom||window.__phantomas){};', '')
        .replace('document.cookie=', 'return')
        .replace(';if((function(){try{return !!window.addEventListener;}', '')
        .replace("catch(e){return false;}})()){document.addEventListener('DOMContentLoaded',l,false);}", '')
        .replace("else{document.attachEvent('onreadystatechange',l);}", '')
        .replace(r"setTimeout('location.href=location.href.replace(/[\?|&]captcha-challenge/,\'\')',1500);", '')
    )
    content = execjs.compile(mode_func)
    cookies = content.call('l')
    __jsl_clearance = cookies.split(';')[0]
    return __jsl_clearance


def _parse_cookie_pairs(text):
    """Parse ``name=value; name=value`` text into a dict."""
    pairs = {}
    for fragment in text.split(';'):
        # Split on the first "=" only: cookie values may themselves contain
        # "=" (for example base64 padding).
        name, sep, value = fragment.strip().partition('=')
        if name and sep:
            pairs[name] = value
    return pairs


def cookie_dict(js, id):
    """Merge two cookie strings into one ``{name: value}`` dict.

    :param js: ``name=value`` text returned by fixed_fun().
    :param id: cookie string returned by get_521_content(); may hold several
        ``; ``-separated pairs.
    :return: dict with the pairs from *js* first, then those from *id*
        (entries from *id* win on a name clash).
    """
    jar = _parse_cookie_pairs(js)
    jar.update(_parse_cookie_pairs(id))
    return jar


if __name__ == '__main__':
    content, cookie_id = get_521_content(url, head)
    cookie_js = fixed_fun(content)
    dicted_cookie = cookie_dict(cookie_js, cookie_id)


# ---------------------------------------------------------------------------
# Code 2
# ---------------------------------------------------------------------------
# The listing uses ``url`` without defining it; repeated here so that Code 2
# is runnable on its own.
url = 'http://www.mafengwo.cn/poi/5423409.html'

# Mimic a browser's request headers.  (The original comment mentions the
# Douban server, but the Host header targets www.mafengwo.cn.)
head = {
    # "*/*" had been garbled to "**" in the scraped listing.
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Host': 'www.mafengwo.cn',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36 Edg/97.0.1072.55',
}


def get_521_content(url):
    """Request *url* and, on a 521 response, build the cookies to retry with.

    :param url: page to request.
    :return: dict of cookies including the computed ``__jsl_clearance`` entry,
        or ``None`` when the status is not 521, no script is found, or the
        script could not be evaluated.
    """
    import requests

    req = requests.get(url, headers=head, timeout=5)
    print(req.status_code, req.text)
    if req.status_code != 521:
        return None

    cookies = dict(req.cookies.items())
    print(cookies)
    # Reconstructed pattern (it was empty in the scraped listing).
    js_con = ''.join(re.findall(r'<script>(.*?)</script>', req.text))
    if not js_con:
        return None

    __jsl_clearance = fixed_fun(js_con, url)
    if not __jsl_clearance:
        return None

    # Split on the first "=" only; the value itself may contain "=".
    key, _, value = __jsl_clearance.partition('=')
    cookies[key] = value
    return cookies


def fixed_fun(js_con, url):
    """Execute the challenge JavaScript and return the ``__jsl_clearance`` pair.

    :param js_con: JavaScript obtained from the first request.
    :param url: page URL, substituted for the DOM lookup the script performs.
    :return: ``name=value`` text, or ``None`` if the JavaScript fails to run.
    """
    import execjs

    func_return = js_con.replace('eval(', 'return(')
    print('第一次替换eval==》return后: ', func_return)
    content = execjs.compile(func_return)
    fn = js_con.split('=')[0].split(' ')[1]
    evaled_func = content.call(fn)
    print('第一次执行js代码后: ', evaled_func)
    fn = evaled_func.split('=')[0].split(' ')[1]  # dynamic function name

    # Content of the <a> tag the script builds.  Reconstructed: both
    # separators were empty strings in the scraped listing, and
    # str.split("") raises ValueError.
    aa = evaled_func.split(r"<a href=\'/")
    aa = aa[1].split("</a>")[0] if len(aa) >= 2 else ''

    # The setTimeout and innerHTML literals below are reconstructed (their
    # backslashes / <a> markup were lost in the scrape).
    mode_func = (
        evaled_func
        .replace(
            r"setTimeout('location.href=location.pathname+location.search.replace(/[\?|&]captcha-challenge/,\'\')',1500);document.cookie=",
            'return')
        .replace(';if((function(){try{return !!window.addEventListener;}', '')
        .replace(
            "}catch(e){return false;}})()){document.addEventListener('DOMContentLoaded'," + fn + ",false)}else{document.attachEvent('onreadystatechange'," + fn + ")",
            '')
        .replace(
            "if((function(){try{return !!window.addEventListener;}catch(e){return false;}})()){document.addEventListener('DOMContentLoaded'," + fn + ",false)}else{document.attachEvent('onreadystatechange'," + fn + ")",
            '')
        .replace("return'__jsl_clearance", "var window={};return '__jsl_clearance")
        .replace(
            "var " + fn + "=document.createElement('div');" + fn + r".innerHTML='<a href=\'/" + aa + "</a>';" + fn + "=" + fn + ".firstChild.href",
            "var " + fn + "='" + url + "'")
    )
    print('第二次替换后的js代码:', mode_func)
    try:
        content = execjs.compile(mode_func)
        cookies = content.call(fn)
        __jsl_clearance = cookies.split(';')[0]
    except Exception as exc:  # best effort: report the failure and give up
        print('js执行错误:', mode_func)
        print(exc)
        return None
    print(__jsl_clearance)
    return __jsl_clearance


def con_spider(cookies, url):
    """Fetch the detail page a second time, sending the computed cookies.

    :param cookies: cookie dict from get_521_content() (may be ``None``).
    :param url: page to request.
    :return: the response when the status is 200, otherwise ``None``.
    """
    import requests

    response = requests.get(url, headers=head, cookies=cookies, timeout=5)
    if response.status_code != 200:
        print('第二次爬取错误状态码:', response.status_code)
        return None
    response.encoding = 'utf-8'
    print(response.status_code)
    print(response.text)
    return response


if __name__ == "__main__":
    cookies = get_521_content(url)
    con_spider(cookies, url)

# Code 3 (article heading "代码三") follows.
# resource:https://www.cnblogs.com/gongs/p/10524710.html
#
# NOTE(review): this listing was collapsed onto a few lines when the page was
# scraped; indentation, backslashes and HTML-like literals were lost and are
# reconstructed here.  Confirm the literals marked "reconstructed" against
# the JavaScript a real 521 challenge page returns.
import re
import urllib.error
import urllib.request

import execjs
import requests

url = 'http://www.mafengwo.cn/poi/5423409.html'

# Mimic a browser's request headers.  (The original comment mentions the
# Douban server, but the target is www.mafengwo.cn.)  The article also listed
# Accept-Language, Cache-Control, Connection, cookie, Host and
# Upgrade-Insecure-Requests here, all commented out.
head = {
    # "*/*" had been garbled to "**" in the scraped listing.
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36 Edg/97.0.1072.55",
}


def getResponse():
    """Request the module-level ``url`` with the module-level ``head``.

    :return: the ``requests`` response.
    """
    # The listing referenced ``url1``, which is never defined; ``url`` is the
    # only target URL this listing declares.
    response = requests.get(url, headers=head)
    return response


def getJslid(response):
    """Serialise the cookies set by *response*.

    :param response: response returned by getResponse().
    :return: cookies joined as ``name=value; name=value``.
    """
    cook = response.cookies
    return '; '.join('='.join(item) for item in cook.items())


def getClearance(response):
    """Run the challenge JavaScript found in *response*.

    :param response: response returned by getResponse().
    :return: the first ``;``-separated part of the string returned by the
        script's dynamically named function.
    """
    # Reconstructed pattern: it was empty in the scraped listing, which can
    # only ever yield an empty string.
    txt = ''.join(re.findall(r'<script>(.*?)</script>', response.text))
    func_return = txt.replace('eval', 'return')
    print(func_return)
    content = execjs.compile(func_return)
    print(type(content))
    # The author noted that running the script unmodified fails with
    # "ReferenceError: document is not defined"; hence the rewriting below.
    eval_func = content.call('x')
    name = re.findall(r'var (.*?)=function.*', eval_func)[0]
    # Strip the browser-only parts of the script and turn the cookie
    # assignment into a return value.  The setTimeout literal is
    # reconstructed: its backslashes were lost in the scrape.
    mode_func = (
        eval_func
        .replace('while(window._phantom||window.__phantomas){};', '')
        .replace('document.cookie=', 'return')
        .replace('if((function(){try{return !!window.addEventListener;}', '')
        .replace("catch(e){return false;}})()){document.addEventListener('DOMContentLoaded',%s,false)}" % name, '')
        .replace("else{document.attachEvent('onreadystatechange',%s)}" % name, '')
        .replace(r"setTimeout('location.href=location.pathname+location.search.replace(/[\?|&]captcha-challenge/,\'\')',1500);", '')
    )
    content = execjs.compile(mode_func)
    cookies = content.call(name)
    clearance = cookies.split(';')[0]
    return clearance


def structurecookie(cook, clearance):
    """Build the Cookie header value for the follow-up request.

    :param cook: cookie string from getJslid().
    :param clearance: ``name=value`` text from getClearance().
    :return: ``cook`` and ``clearance`` joined with ``;``.
    """
    cookie = cook + ';' + clearance
    print(cookie)
    return cookie


if __name__ == '__main__':
    response = getResponse()
    clearance = getClearance(response)
    cook = getJslid(response)
    # NOTE(review): the article computed ``clearance`` and ``cook`` but then
    # sent a hard-coded cookie captured from one past browser session
    # (including login/session identifiers) and never called
    # structurecookie().  The freshly computed cookie is sent instead;
    # confirm this matches the author's intent.
    head = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
        "Cache-Control": "max-age=0",
        "Connection": "keep-alive",
        "cookie": structurecookie(cook, clearance),
        "Host": "www.mafengwo.cn",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36 Edg/97.0.1072.55",
    }
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        # The context manager closes the connection once the body is read.
        with urllib.request.urlopen(request) as page:
            html = page.read().decode(encoding="utf-8", errors="ignore")
        print(html)
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print("状态码:%s" % (e.code))
        if hasattr(e, "reason"):
            print("原因:%s" % (e.reason))
欢迎分享,转载请注明来源:内存溢出
评论列表(0条)