求求大佬帮忙啊
import requests
from lxml import etree


def get_paper_link(ur1):
    """Scrape a Baidu Xueshu result page and collect DOIs of the listed papers.

    For every article link found on the search-result page, the article page
    is fetched and its DOI paragraph extracted; entries containing the DOI
    registrant prefix '10' are collected.

    Parameters
    ----------
    ur1 : str
        URL of the Baidu Xueshu search-result page.

    Returns
    -------
    list[str]
        The DOI strings that were found (possibly empty).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/100.0.4896.60 Safari/537.36 Edg/100.0.1185.29'
    }
    # timeout added so a stalled connection cannot hang the script forever.
    res1 = requests.get(ur1, headers=headers, timeout=10).text
    html_xpath = etree.HTML(res1)
    # NOTE(review): the attribute predicates were lost in transcription --
    # '[@]' is not valid XPath and will raise XPathEvalError. Restore the
    # original class names (e.g. //h3[@class="..."]/a/@href) before running.
    paper_links = html_xpath.xpath('//h3[@]/a/@href')

    doi_list = []
    for paper_link in paper_links:
        print(paper_link)
        # Fetch the article page and try to pull its DOI paragraph.
        res2 = requests.get(paper_link, headers=headers, timeout=10).text
        html_xpath2 = etree.HTML(res2)
        try:
            # Articles without a DOI yield an empty xpath() result, so [0]
            # raises IndexError -- skip only that case instead of silently
            # swallowing every error with a bare `except`.
            paper_doi = html_xpath2.xpath('//div[@]/p[@]/text()')[0].strip()
        except IndexError:
            continue
        # Every DOI starts with the registrant prefix '10'; the original
        # `str(10)` built the same substring far less clearly.
        if '10' in paper_doi:
            doi_list.append(paper_doi)
    return doi_list


# Original had a stray trailing backslash after this string literal, which
# continued the assignment onto the call line and made the file a
# SyntaxError; the continuation is removed and the URL split idiomatically.
ur1 = (
    'https://xueshu.baidu.com/s?wd=%E6%B0%B4%E4%B8%8B%E5%9B%BE%E5%83%8F%E5%A4%8D%E5%8E%9F&'
    'rsv_bp=0&tn=SE_baiduxueshu_c1gjeupa&rsv_spt=3&ie=utf-8&f=8&rsv_sug2=1&'
    'sc_f_para=sc_tasktype%3D%7BfirstSimpleSearch%7D'
)
get_paper_link(ur1)
欢迎分享,转载请注明来源:内存溢出
评论列表(0条)