import requests
from selenium import webdriver
import time


def grasp(urlT):
    """Fetch the ifeng finance news feed at *urlT* and, for each of the
    first 29 items, print/collect its title, time, source and url, open
    the article in Chrome to grab its body text, and append the record
    to ./news.txt.

    NOTE(review): field key casing ('Title' vs 'title') was garbled in
    the pasted source — verify against the live API response.
    """
    # Local path of the chromedriver executable used by Selenium.
    driver = webdriver.Chrome(
        r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe')
    resAll = []                  # every collected record
    res = requests.get(urlT)
    data = res.json()['data']    # parse the JSON payload once, not per field
    # Original hard-coded range(0, 29); clamp so a shorter feed cannot IndexError.
    for i in range(min(29, len(data))):
        item = data[i]
        # Fresh dict per item; reusing one dict would make every
        # resAll entry alias the same (last) record.
        rest = {}
        rest['Title'] = item['Title']
        # Some items carry no timestamp; fall back to the literal 'None'.
        rest['newsTime'] = item.get('newsTime', 'None')
        rest['source'] = item['source']
        url = item['url']
        rest['url'] = url
        print(rest['Title'])
        print(rest['newsTime'])
        print(rest['source'])
        print(url)
        try:
            driver.get(url)
            time.sleep(4)        # let the article page render
            contend = driver.find_element_by_class_name('text-3zQ3cZD4').text
            rest['contend'] = str(contend)
            print(contend)
            driver.back()
            time.sleep(6)
        except Exception:        # page lacks the expected element / nav failed
            print(f'第{i}条新闻失败')
            print('#-----------------------某些格式不符合------------------------#')
        resAll.append(rest)
        with open('./news.txt', 'a+', encoding='utf-8') as f:
            try:
                f.write(''.join(resAll[i].values()) + '\n')
            except Exception:    # e.g. a non-string value slipped into the record
                print('写入失败')
"https://shankAPI.ifeng.com/spring/finance/index/newInfoIndex/75219"45 grasp(url)46 47 48 class Grasp:49 50 def __init__(self):51 self.driver = webdriver.Chrome(r
‘C:\Program files (x86)\Google\Chrome\Application\chromedriver.exe‘)52 self.resAll = []
#用于存储单条数据53 self.rest = {}
#用于存储单个数据54 self.res = requests.get(
"https://shankAPI.ifeng.com/spring/finance/index/newInfoIndex/75219")
#目标链接55 56 def run(self):57 for i
in range(0,len(self.res.Json()[
‘data‘])):58 print(self.res.Json()[
‘data‘][i][
‘Title‘])
#输出标题59 try:60 print(self.res.Json()[
‘data‘][i][
‘newsTime‘])
#输出时间61 except:62 print(
‘None‘)63 print(self.res.Json()[
‘data‘][i][
‘source‘])
#输出来源64 print(self.res.Json()[
‘data‘][i][
‘url‘])
#输出链接地址65 self.rest[
‘Title‘] = self.res.Json()[
‘data‘][i][
‘Title‘]
#获取标题66 try:67 self.rest[
‘newsTime‘] = self.res.Json()[
‘data‘][i][
‘newsTime‘]
#获取时间68 except:69 self.rest[
‘newsTime‘] =
‘None‘70 self.rest[
‘source‘] = self.res.Json()[
‘data‘][i][
‘source‘]
#获取来源71 self.url = self.res.Json()[
‘data‘][i][
‘url‘]72 self.rest[
‘url‘] = self.res.Json()[
‘data‘][i][
‘url‘]
#获取链接地址73 try:74 self.driver.get(url)75 time.sleep(4
)76 self.contend = self.driver.find_element_by_class_name(
‘text-3zQ3cZD4‘).text
#获取网页标签下的文本77 self.rest[
‘contend‘] = str(self.contend)
#插入单条数据78 print(f
‘第{i}条新闻成功‘)79 self.driver.back()80 time.sleep(4
)81 except:82 print(f
‘第{i}条新闻失败‘)83 print(
‘#-----------------------某些格式不符合------------------------#‘)84 self.resAll.append(self.rest)85 with open(
‘./news.txt‘,enCoding=
‘utf-8‘) as f:86 try:87 f.write(f
‘第{i}条新闻开始‘)88 f.write(
‘‘.join(self.resAll[i].values()) +
‘\n‘)
#写入数据89 f.write(f
‘第{i}条新闻结束‘)90 except:91 print(
‘写入失败‘)92 93 g =
Grasp()94 g.run()
View Code 有些规则需要自己定义判断。
希望能帮到大家。
总结
以上是内存溢出为你收集整理的python爬取凤凰网站的新闻,及其链接地址,来源,时间和内容,用selenium自动化和requests处理数据全部内容,希望文章能够帮你解决python爬取凤凰网站的新闻,及其链接地址,来源,时间和内容,用selenium自动化和requests处理数据所遇到的程序开发问题。
如果觉得内存溢出网站内容还不错,欢迎将内存溢出网站推荐给程序员好友。
评论列表(0条)