2021-03-10 Python 批量下载文献PDF

2021-03-10 Python 批量下载文献PDF,第1张

# -*- coding: utf-8 -*-
"""Batch-download paper PDFs through Sci-Hub.

Created on Mar 10 21:22:22 2021
@author: kimol_love & solar2030

This script is based on kimol_love's code published at
https://blog.csdn.net/kimol_justdo/article/details/112996678?spm=1001.2014.3001.5501
-- many thanks to him.

Compared with that script, a ``for`` loop is used so that a whole series of
papers can be downloaded in one go.  All that has to be prepared is a text
file containing one paper title (or DOI) per line.  A bug caused by '/' in
paper titles is also fixed: characters that cannot appear in a file name are
replaced with '_' before the PDF is written.
"""

import os
import time
from tkinter.filedialog import askopenfilename
from urllib.parse import urljoin

import matplotlib.pyplot as plt  # not used by this script; kept from the original listing
import numpy as np  # not used by this script; kept from the original listing
import requests
from bs4 import BeautifulSoup

SCI_HUB_URL = 'https://www.sci-hub.ren/'
# Directory offered first by the "choose the paper list" dialog.
INITIAL_DIR = 'D:\\'
# Directory the downloaded PDFs are written to.
SAVE_DIR = r'D:\doc_E\papers'
# Seconds to wait for the server before giving up on a request.
REQUEST_TIMEOUT = 30
# Pause between two papers, in seconds.
REQUEST_DELAY = 0.8

# Headers shared by the search and the download request.  Accept-Encoding is
# deliberately not overridden: requests advertises the encodings it is able
# to decode on its own.
_BASE_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:84.0) '
                  'Gecko/20100101 Firefox/84.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,'
              'image/webp,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,'
                       'en-US;q=0.3,en;q=0.2',
    'Connection': 'keep-alive',
    'upgrade-insecure-requests': '1',
}

# Characters that Windows does not allow in a file name, each mapped to '_'.
_FILENAME_TRANSLATION = str.maketrans({char: '_' for char in '\\/:*?"<>|'})

# Raw string: the art contains backslashes, including one at the end of a
# line, which a normal string literal would treat as a line continuation.
_BANNER = r'''
               _____  _____ _____      _    _ _    _ ____
              / ____|/ ____|_   _|    | |  | | |  | |  _ \
             | (___ | |      | |______| |__| | |  | | |_) |
              \___ \| |      | |______|  __  | |  | |  _ <
              ____) | |____ _| |_     | |  | | |__| | |_) |
             |_____/ \_____|_____|    |_|  |_|\____/|____/
'''


def search_article(artname):
    """Search Sci-Hub for a paper.

    Args:
        artname: paper title, DOI, PMID or URL to search for.

    Returns:
        The absolute URL of the PDF, or '' when the result page contains no
        PDF frame.

    Raises:
        requests.RequestException: the request failed or timed out.
    """
    headers = dict(_BASE_HEADERS)
    headers['Content-Type'] = 'application/x-www-form-urlencoded'
    headers['Origin'] = 'https://www.sci-hub.ren'
    # Content-Length is left to requests, which computes it from the payload.
    payload = {'sci-hub-plugin-check': '', 'request': artname}
    res = requests.post(SCI_HUB_URL, headers=headers, data=payload,
                        timeout=REQUEST_TIMEOUT)
    soup = BeautifulSoup(res.text, 'html.parser')
    iframe = soup.find(id='pdf')
    if iframe is None:  # no matching article found
        return ''
    src = iframe.get('src', '')
    if not src:
        return ''
    # The frame source may be scheme-relative ("//host/file.pdf"); resolve it
    # against the site URL so that the result is always absolute.
    return urljoin(SCI_HUB_URL, src)


def download_article(downUrl):
    """Download a paper from its PDF link.

    Args:
        downUrl: absolute URL of the PDF.

    Returns:
        The raw bytes of the PDF file.

    Raises:
        requests.RequestException: the request failed, timed out, or the
            server answered with an HTTP error status.
    """
    res = requests.get(downUrl, headers=_BASE_HEADERS,
                       timeout=REQUEST_TIMEOUT)
    # Without this check an HTML error page would be saved as a ".pdf".
    res.raise_for_status()
    return res.content


def welcome():
    """Clear the console and print the banner."""
    os.system('cls' if os.name == 'nt' else 'clear')
    print(_BANNER)


def _safe_filename(title):
    """Return *title* with characters that are illegal in file names replaced by '_'."""
    return title.translate(_FILENAME_TRANSLATION)


def _load_requests(path):
    """Return the non-blank, stripped lines of the text file at *path*."""
    with open(path) as txt_file:
        stripped = (line.strip() for line in txt_file)
        return [line for line in stripped if line]


def _fetch_and_save(request, save_dir):
    """Search for one paper and save its PDF in *save_dir*; return True on success."""
    print('搜索中...')
    down_url = search_article(request)
    if down_url == '':
        print('未找到相关论文,请重新搜索!')
        return False
    print('论文链接:%s' % down_url)
    print('下载中...')
    pdf = download_article(down_url)
    target = os.path.join(save_dir, _safe_filename(request) + '.pdf')
    with open(target, 'wb') as pdf_file:
        pdf_file.write(pdf)
    print('---下载完成---')
    return True


def main():
    """Ask for a list of papers and download every entry in it."""
    list_path = askopenfilename(title='Paper Lists: Title or doi',
                                filetypes=[('TXT', '*.txt')],
                                initialdir=INITIAL_DIR)
    if not list_path:  # the dialog was cancelled
        print('未选择文件,已退出。')
        return

    paper_requests = _load_requests(list_path)
    os.makedirs(SAVE_DIR, exist_ok=True)

    # One entry per paper: '1' when it was downloaded, '0' otherwise.
    results = []
    for request in paper_requests:
        welcome()
        try:
            succeeded = _fetch_and_save(request, SAVE_DIR)
        except (requests.RequestException, OSError) as err:
            # A single failure must not abort the rest of the batch.
            print('下载失败: %s' % err)
            succeeded = False
        results.append('1' if succeeded else '0')
        time.sleep(REQUEST_DELAY)
    print('下载完成统计: %s' % (results,))


if __name__ == '__main__':
    main()

以上是内存溢出为你收集整理的《2021-03-10 Python 批量下载文献PDF》的全部内容,希望文章能够帮你解决在《2021-03-10 Python 批量下载文献PDF》中所遇到的程序开发问题。



原文地址: http://www.outofmemory.cn/langs/1188788.html

打赏 微信扫一扫 微信扫一扫 支付宝扫一扫 支付宝扫一扫
上一篇 2022-06-03
下一篇 2022-06-03



