Crawling and Saving Douban Group Images


Overview

The script below crawls a Douban group with a three-stage, multi-threaded pipeline: a group parser reads the group's topic list and queues one item per topic, a topic parser opens each topic page and queues the image URLs it finds, and two storage threads download those images into a local folder. The stages hand work to one another through thread-safe queues.
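The stages shut down cooperatively: each producer pushes an empty dict into its output queue as an end-of-stream sentinel, and each consumer re-posts that sentinel before exiting so sibling threads stop as well. A minimal, self-contained sketch of the pattern (the producer/consumer names here are illustrative, not from the script):

import queue
import threading

SENTINEL = {}  # a falsy marker meaning "no more work"

def producer(out_q):
    # stand-in for the group parser: emit a few items, then the sentinel
    for i in range(3):
        out_q.put({'title': 'topic {0}'.format(i)})
    out_q.put(SENTINEL)

def consumer(in_q):
    while True:
        item = in_q.get(True, 300)  # block up to 5 minutes, as the crawler does
        if not item:                # the empty dict means upstream is done
            in_q.put(SENTINEL)      # re-post so sibling consumers also stop
            break
        print('processing', item['title'])

q = queue.Queue()
threads = [threading.Thread(target=producer, args=(q,)),
           threading.Thread(target=consumer, args=(q,))]
for t in threads:
    t.start()
for t in threads:
    t.join()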


#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import logging
import logging.handlers
import os
import os.path
import queue
import random
import threading
import time

import lxml.etree as etree
import requests

DOUBAN_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Referer': 'http://www.douban.com/search?cat=1019&q=%E5%AE%B3%E7%BE%9E',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/38.0.2125.104 Safari/537.36',
    'Accept-Encoding': 'gzip, deflate',
    'Host': 'www.douban.com',
    'Connection': 'Keep-Alive',
}
IMAGE_HEADERS = {
    'Accept': 'image/webp,*/*;q=0.8',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/38.0.2125.104 Safari/537.36',
}
LOG_FILE = 'douban_crawler.log'
MAX_LOG_SIZE = 1024 * 1024  # 1 MB
LOG_BACKUP_COUNT = 3

logger = logging.getLogger('crawler')
logger.setLevel(logging.DEBUG)
fh = logging.handlers.RotatingFileHandler(LOG_FILE, maxBytes=MAX_LOG_SIZE,
                                          backupCount=LOG_BACKUP_COUNT, encoding='utf-8')
fh.setLevel(logging.DEBUG)
ch = logging.StreamHandler()
ch.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(lineno)d - %(message)s')
fh.setFormatter(formatter)
ch.setFormatter(formatter)
# add the handlers to the logger
logger.addHandler(fh)
logger.addHandler(ch)

DEBUG = logger.debug
INFO = logger.info
WARNING = logger.warning
ERROR = logger.error


class Parser_Douban_Group(threading.Thread):
    """Parse the group's topic list and feed one item per topic into a queue."""

    def __init__(self, url, topic_queue, t_name='Parser Group'):
        threading.Thread.__init__(self, name=t_name)
        self.data = topic_queue
        self.url = url
        self.s = requests.Session()

    def run(self):
        INFO('{0} started!'.format(self.name))
        co = 0
        htm = open_douban_page(self.url, self.s)
        try:
            parser = etree.HTMLParser(recover=True)
            text_dom = etree.fromstring(htm, parser)
        except Exception as e:
            ERROR('Parse douban page error: {0}'.format(e))
        else:
            group_name = ''.join(text_dom.xpath("//div[@id='group-info']/h1//text()")).strip()
            INFO('Group name: {0}'.format(group_name))
            for row in text_dom.xpath("//tr[@class='']"):
                co += 1
                item = {
                    'title': ''.join(row.xpath("child::td[@class='title']/a//text()")),
                    'url': ''.join(row.xpath("child::td[@class='title']/a/attribute::href")),
                    'auth': ''.join(row.xpath("child::td[@nowrap='nowrap']/a[@class='']//text()")),
                    'reply': ''.join(row.xpath("child::td[@class='']//text()")),
                    'time': ''.join(row.xpath("child::td[@class='time']//text()")),
                }
                # push each topic into the queue
                self.data.put(item, block=True)
                DEBUG('{0} Put({1}) - ({2} ...)'.format(self.name, co, item['title'][:20]))
        # push the end-of-stream sentinel
        self.data.put({})
        INFO('{0} finished! put {1} topics to queue.'.format(self.name, co))


class Parser_Douban_Topic(threading.Thread):
    """Open each topic page and feed the image URLs it contains into a second queue."""

    def __init__(self, topic_queue, content_queue, t_name='Parser Topic'):
        threading.Thread.__init__(self, name=t_name)
        self.topic_queue = topic_queue
        self.content_queue = content_queue
        self.s = requests.Session()

    def run(self):
        INFO('{0} started!'.format(self.name))
        co = 0
        coo = 0
        while True:
            try:
                # read from the queue, waiting at most 5 minutes
                val = self.topic_queue.get(True, 300)
            except queue.Empty as e:
                ERROR('{0} timeout! {1}'.format(self.name, e))
                break
            if not val:
                # re-post the sentinel so sibling parsers can stop too
                self.topic_queue.put({})
                INFO('{0} got {1} topics from queue.'.format(self.name, co))
                break
            co += 1
            DEBUG('{0} Get({1}) - ({2} ...)'.format(self.name, co, val['title'][:20]))
            htm = open_douban_page(val['url'], self.s)
            try:
                parser = etree.HTMLParser(recover=True)
                text_dom = etree.fromstring(htm, parser)
            except Exception as e:
                ERROR('Parse douban page error: {0}'.format(e))
                continue
            topic_name = ''.join(text_dom.xpath("//div[@id='content']/h1//text()")).replace('\n', '').strip()
            DEBUG('Topic name: {0}'.format(topic_name))
            div_node = text_dom.xpath("//div[@class='topic-content']")
            if not div_node:
                continue
            for src in div_node[0].xpath('descendant::img/attribute::src'):
                coo += 1
                item = {'title': topic_name + str(coo), 'url': src}
                self.content_queue.put(item)
                DEBUG('{0} Put({1}) - ({2} ...)'.format(self.name, coo, item['title'][:20]))
        # push the end-of-stream sentinel
        self.content_queue.put({})
        INFO('{0} finished! put {1} images to queue.'.format(self.name, coo))


class Save_Douban_Group(threading.Thread):
    """Pull image URLs from the queue and save the image files to disk."""

    def __init__(self, content_queue, folder_name='image', t_name='Storage'):
        threading.Thread.__init__(self, name=t_name)
        self.data = content_queue
        self.folder = folder_name
        self.s = requests.Session()

    def run(self):
        INFO('{0} started!'.format(self.name))
        os.makedirs(self.folder, exist_ok=True)
        co = 0
        coo = 0
        while True:
            try:
                # read from the queue, waiting at most 5 minutes
                val = self.data.get(True, 300)
            except queue.Empty as e:
                ERROR('{0} timeout! {1}'.format(self.name, e))
                break
            if not val:
                # re-post the sentinel so sibling savers can stop too
                self.data.put({})
                break
            co += 1
            DEBUG('{0} Get({1}) - ({2} ...)'.format(self.name, co, val['title'][:20]))
            img_dt = open_douban_page(val['url'], self.s, ret_raw=True)
            img_nm = val['url'].split('/')[-1]
            if img_dt:
                fn = os.path.join(self.folder, img_nm)
                if not os.path.exists(fn):
                    with open(fn, 'wb') as fp:
                        fp.write(img_dt)
                    coo += 1
        INFO('{0} finished! saved images({1}/{2}).'.format(self.name, coo, co))


def open_douban_page(url, s, retries=3, ret_raw=False):
    """Fetch a page (or raw image bytes when ret_raw is True), retrying connection errors."""
    ret = ''
    try:
        cookies = dict(bid='RmFNKKPAd0s')
        if ret_raw:
            r = s.get(url, headers=IMAGE_HEADERS, stream=True)
        else:
            r = s.get(url, headers=DOUBAN_HEADERS, cookies=cookies)
        r.raise_for_status()
        # throttle requests a little
        time.sleep(random.uniform(0.3, 1.5))
    except requests.ConnectionError as e:
        ERROR('Connect douban error({0}): {1}'.format(retries, e))
        retries -= 1
        if retries > 0:
            time.sleep(0.5)
            ret = open_douban_page(url, s, retries, ret_raw)
    except Exception as e:
        ERROR('Open douban url({0}) error: {1}'.format(url, e))
    else:
        DEBUG('Request url: {0}'.format(url))
        ret = r.content if ret_raw else r.text
    return ret


def crawler_douban(group_url, folder_name, task_name):
    q_topic = queue.Queue()
    q_content = queue.Queue()

    workers = []
    workers.append(Parser_Douban_Group(group_url, q_topic, '{0} 1'.format(task_name)))
    workers.append(Parser_Douban_Topic(q_topic, q_content, 'Parser Topic 1'))
    for i in range(1, 3):
        workers.append(Save_Douban_Group(q_content, folder_name, 'Storage {0}'.format(i)))

    for obj in workers:
        obj.start()
    for obj in workers:
        obj.join()


if __name__ == '__main__':
    haixiu_hangzhou_url = 'http://www.douban.com/group/505137/'
    haixiu_url = 'http://www.douban.com/group/haixiuzu/'
    crawler_douban(haixiu_url, 'image', 'Parser HaiXiu Group')
    input('Press any key to exit!')
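The script depends only on requests and lxml. To point the crawler at a different group, change the URL handed to crawler_douban; a minimal sketch (the group id below is a placeholder, not a real Douban group):

# crawl another group into a separate folder; 'example_group' is hypothetical
crawler_douban('http://www.douban.com/group/example_group/', 'photos', 'Parser Example Group')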

以上是内存溢出(jb51.cc)为你收集整理的全部代码内容,希望文章能够帮你解决所遇到的程序开发问题。

如果觉得内存溢出网站内容还不错,欢迎将内存溢出网站推荐给程序员好友。


Source: http://www.outofmemory.cn/langs/1199340.html
