selenium爬虫操作网页(实战篇)
前面我们遇到了一个爬虫难题:爬虫最怕遇到JavaScript依赖性的动态网页。我们选择了[在R里面配置selenium爬虫环境](),但安装和配置好selenium环境、打开一个JavaScript控制的动态网页仅仅是爬虫的开始,接下来还需要跟这个网页进行各式各样的交互。首先放出一些学习链接:
https://ropensci.org/tutorials/rselenium_tutorial/ http://thatdatatho.com/2019/01/22/tutorial-web-scraping-rselenium/ https://www.selenium.dev/
// Excerpt of the target page's own JavaScript: the plasmid list is built by
// these dynamically injected <script> tags, which is why a plain
// rvest::read_html() fetch sees an empty page and a real browser driven by
// Selenium is required.
// NOTE(review): the "?v=" + new Date().getTime() suffix looks like a
// cache-busting query string — it changes on every load.
document.write('<script src="../../js/common/base.js?v='+new Date().getTime()+'" type="text/javascript" charset="utf-8"><\/script>');
document.write('<script src="../../js/util/jump.js?v='+new Date().getTime()+'" type="text/javascript" charset="utf-8"><\/script>');
document.write('<script src="../../js/common/common_methods.js?v='+new Date().getTime()+'" type="text/javascript" charset="utf-8"><\/script>');
document.write('<script src="../../js/common/parts.js?v='+new Date().getTime()+'" type="text/javascript" charset="utf-8"><\/script>');
document.write('<script src="../../js/plasmid/plasmid_list.js?v='+new Date().getTime()+'" type="text/javascript" charset="utf-8"><\/script>');
document.write('<script src="../../js/plasmid/plasmid_list_mobile.js?v='+new Date().getTime()+'" type="text/javascript" charset="utf-8"><\/script>');
</script>
################ Load packages #################################
# (the original loaded rvest twice; once is enough)
library(rvest)      # read_html() / html_nodes() for parsing HTML
library(stringr)    # string helpers
library(RSelenium)  # drive a real browser for JavaScript-rendered pages

############### Connect to the Selenium server, open a browser ##
remDr <- remoteDriver(remoteServerAddr = "127.0.0.1",
                      port = 4444,
                      browserName = "chrome")  # connect to the server
remDr$open()  # launch the browser
# Open the JavaScript-rendered plasmid list page
remDr$navigate("http://www.brics.ac.cn/plasmid/template/plasmid/plasmid_list.html")
<a href="javascript:;" class="layui-laypage-prev layui-disabled" data-page="0">上一页</a>
<span class="layui-laypage-curr"><em class="layui-laypage-em"></em><em>1</em></span>
<a href="javascript:;" data-page="2">2</a><span class="layui-laypage-spr">…</span>
<a href="javascript:;" class="layui-laypage-last" title="尾页" data-page="3396">3396</a>
<a href="javascript:;" class="layui-laypage-next" data-page="2">下一页</a></div></div>
# Parse the HTML source shown above: click the "next" button once, then
# collect the href of every <a> element on the rendered page.
webElem <- remDr$findElement(using = "class", value = "layui-laypage-next")
webElem$clickElement()
links <- remDr$getPageSource()[[1]] %>%
  read_html() %>%
  html_nodes("a") %>%
  html_attr("href")
links
# Walk through the remaining result pages by clicking "next" repeatedly,
# harvesting every <a href> on each page.
# Preallocate a list instead of growing `links` with c() on every
# iteration (the original did ~3395 full-vector copies, O(n^2)).
n_pages <- 3396  # total page count, from the pager's last data-page attribute
page_links <- vector("list", n_pages)
for (i in 2:n_pages) {  # page 1 was collected above; same range as the old while-loop
  webElem <- remDr$findElement(using = "class", value = "layui-laypage-next")
  webElem$clickElement()
  lks <- remDr$getPageSource()[[1]] %>%
    read_html() %>%
    html_nodes("a") %>%
    html_attr("href")
  print(lks)  # progress feedback while the scrape runs
  page_links[[i]] <- lks
}
links <- c(links, unlist(page_links))
# De-duplicate the harvested links, persist them, then keep only the
# detail-page URLs.
links <- unique(links)
links
save(links, file = 'plasmid_detail_links.Rdata')
links
# fixed = TRUE: match the literal filename — without it the "." in the
# pattern is a regex wildcard and could match unintended URLs.
kp <- grepl('plasmid_detail.html', links, fixed = TRUE)
links <- links[kp]
length(links)
# Re-connect to the Selenium server and visit each detail page, extracting
# the plasmid name, its identification and the three "panel-body" sections,
# appending one row per page to info1.txt.
remDr <- remoteDriver(remoteServerAddr = "127.0.0.1",
                      port = 4444,
                      browserName = "chrome")  # connect to the server
remDr$open()  # launch the browser

# Strip the tabs/newlines that html_text() keeps from the page layout.
clean_text <- function(x) gsub('[\t\n]', '', x)

for (i in seq_along(links)) {  # was a hard-coded 1:5000, which breaks if length(links) differs
  print(i)
  url <- links[i]
  remDr$navigate(url)  # open the detail page
  Sys.sleep(0.5)       # give the page's JavaScript time to render
  print(remDr$getCurrentUrl())
  htmls <- remDr$getPageSource()[[1]]
  page <- read_html(htmls)  # parse once; the original re-parsed the source three times

  # <div class="panel-body"> — the info panels on the detail page.
  bodys <- page %>% html_nodes('.panel-body')
  # Guard against pages with fewer than 3 panels (the original crashed there).
  panel_text <- function(k) {
    if (length(bodys) >= k) clean_text(html_text(bodys[[k]])) else NA_character_
  }
  c1 <- panel_text(1)
  c2 <- panel_text(2)
  c3 <- panel_text(3)

  # id="plasmidName"
  plasmidName <- page %>% html_nodes('#plasmidName') %>% html_text()
  # id="plasmid_identification"
  plasmid_identification <- page %>% html_nodes('#plasmid_identification') %>% html_text()

  info <- data.frame(plasmidName, plasmid_identification, c1, c2, c3)
  rm(htmls)
  write.table(info, file = 'info1.txt',
              col.names = FALSE, row.names = FALSE,  # TRUE/FALSE, not T/F
              append = TRUE)
}
https://www.jianshu.com/p/e5b252c90e0d https://blog.csdn.net/qq_33291559/article/details/80028119 https://www.jianshu.com/p/1fc6a6817160
css selector手册:https://www.runoob.com/cssref/css-selectors.html xpath selector手册:https://www.runoob.com/xpath/xpath-tutorial.html xpath查找节点:https://www.cnblogs.com/txwen/p/7999485.html 关于GET和POST的有趣的解释:https://zhuanlan.zhihu.com/p/22536382 RCurl解析:https://blog.csdn.net/kMD8d5R/article/details/78933384 html文件及http基本知识:https://www.w3school.com.cn/tags/html_ref_byfunc.asp post/get格式化工具:http://coolaf.com rvest模拟浏览行为:https://blog.csdn.net/weixu22/article/details/79237512 rvest模拟点击网页:https://www.jb51.cc/html/224799.html
文末友情宣传
生信爆款入门-全球听(买一得五)(第5期)(可能是最后一期)你的生物信息学入门课 (必看!)数据挖掘第3期(两天变三周,实力加量),医学生/临床医师首选技能提高课 生信技能树的2019年终总结 ,你的生物信息学成长宝藏 2020学习主旋律,B站74小时免费教学视频为你领路,还等什么,看啊!!!
赞 (0)