Scraping the PDFs of all Microbiome articles published up to May 19, 2020

Preface

520 (May 20, the Chinese "I love you" day) is here. As a researcher I don't have much to give you, so today Zhao Xiangyang is giving away every article Microbiome has published. The journal's impact factor has already passed 10, and this year's is projected to be around 12. The quality of the papers speaks for itself; they have been consistently excellent since the journal launched. So here are 400-plus Microbiome articles, about 1 GB in total (see the end of the post for the download keyword). If you are interested in another microbial-ecology journal, we can crawl it in full as well; as usual, leave a message on the official account, and we will work through the most-upvoted requests one by one.

Main text

The full crawler code

The script not only downloads the article PDFs but also records each article's metadata (title, DOI, accesses, citations, Altmetric score, publication date, and authors) to data.csv. You can run it yourself; the full crawl is quick and finishes in about half a day.
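Before launching the full crawl, a quick smoke test against the first listing page (a minimal sketch, separate from the full script that follows) confirms that the journal site is reachable from your network and that the listing URL pattern still works:

import requests

# Reachability check for the first listing page (page=1); expect 200
url = 'https://microbiomejournal.biomedcentral.com/articles?searchType=journalSearch&sort=PubDate&page=1'
headers = {'User-Agent': 'Mozilla/5.0'}
print(requests.get(url, headers=headers).status_code)

The full script follows.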

import requests
import json
from lxml import etree
import time
import datetime
import os
# Fetch a page and return its HTML text
def get_one_page(url):
    try:
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'}
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except requests.RequestException:
        print("Request failed")
        return None
# Fetch a PDF and return the full response object
def get_one_page_pdf(url):
    try:
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'}
        response = requests.get(url, headers=headers)
        print(response.status_code)
        if response.status_code == 200:
            return response
        return None
    except requests.RequestException:
        print("Request failed")
        return None
# Parse an article page and extract its metadata
def parse_one_page(html):
    html = etree.HTML(html)
    title = html.xpath('//*[@id="main-content"]/main/article/div/h1/text()')
    Accesses = html.xpath('//*[@id="main-content"]/main/article/div/div/ul/li[1]/p//text()')
    DOI = html.xpath('//meta[@name="prism.doi"]/@content')
    published = html.xpath('//*[@id="main-content"]/main/article/div/ul[1]/li[3]/a/time/text()')
    Citations = html.xpath('//*[@id="main-content"]/main/article/div/div/ul/li[2]/p/text()')
    Altmetric = html.xpath('//*[@id="main-content"]/main/article/div/div/ul/li[3]/p/text()')
    author = html.xpath('//*[@id="main-content"]/main/article/div/ul[2]//text()')
    # Keep only the longer text fragments (author names) and drop the last
    # three entries, which are not author names
    names = []
    for i in range(len(author)):
        if len(author[i]) > 6:
            names.append(author[i])
    author = names[:-3]
    yield {
        'Title': title,
        'Accesses': Accesses,
        'DOI': DOI,
        'Citations': Citations,
        'Altmetric': Altmetric,
        'published': published,
        'author': author
    }
def get_paper_url(n):
    # Listing page for page number n
    url = 'https://microbiomejournal.biomedcentral.com/articles?searchType=journalSearch&sort=PubDate&page={num}'.format(num=n)
    # Fetch the listing page
    html = get_one_page(url)
    # Parse the href of every article on the page
    html = etree.HTML(html)
    href = html.xpath('//h3/a[@data-test="title-link"]/@href')
    # Process each article
    for i in href:
        # URL of the article page and of its PDF
        url = 'https://microbiomejournal.biomedcentral.com' + str(i)
        doi = i.replace('/articles', '')
        url_pdf = 'https://microbiomejournal.biomedcentral.com/track/pdf' + doi + '#page=1'
        print(url_pdf)
        html = get_one_page(url)
        # Append the article's metadata to data.csv, then download its PDF
        for item in parse_one_page(html):
            with open('data.csv', 'a', encoding='utf-8') as f:
                f.write(json.dumps(item['Title'], ensure_ascii=False) + ',')
                f.write(json.dumps(item['DOI'], ensure_ascii=False) + ',')
                f.write(json.dumps(item['Accesses'], ensure_ascii=False) + ',')
                f.write(json.dumps(item['Citations'], ensure_ascii=False) + ',')
                f.write(json.dumps(item['Altmetric'], ensure_ascii=False) + ',')
                f.write(json.dumps(item['published'], ensure_ascii=False) + ',')
                f.write(json.dumps(item['author'], ensure_ascii=False) + '\n')
            time.sleep(2)
            print('====== fetching PDF ======')
            r = get_one_page_pdf(url_pdf)
            print('====== saving PDF ======')
            with open(os.path.join('data_pdf', item['Title'][0] + '.pdf'), 'wb') as p:
                p.write(r.content)
            print('====== saved ======')
            time.sleep(2)
# Main program
def main():
    # Make sure the PDF output directory exists
    os.makedirs('data_pdf', exist_ok=True)
    # Crawl listing pages 1 to 17
    for n in range(1, 18):
        try:
            starttime = datetime.datetime.now()
            get_paper_url(n)
            duringtime = datetime.datetime.now() - starttime
            print('Page %d done' % n)
            print(duringtime.seconds)
        except Exception:
            print('Page %d failed' % n)
            continue

# Run
if __name__ == '__main__':
    main()
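One caveat worth noting: the PDF file name is taken directly from the article title, and some titles contain characters such as '/' or ':' that are not valid in file names, so the open() call can fail for those papers. A small helper like the hypothetical sanitize_filename below (an assumption, not part of the original script) avoids that; you would apply it to item['Title'][0] before building the output path.

import re

# Hypothetical helper (not in the original script): replace characters that are
# illegal in Windows/Unix file names before using an article title as a file name.
def sanitize_filename(title):
    return re.sub(r'[\\/:*?"<>|]', '_', title).strip()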
