What you've been waiting for: scraping the nearly 3,000 ISME J papers

Preface

A few days ago, Zhao Xiangyang scraped the complete set of Microbiome articles for us. There weren't many, since Microbiome is a young journal, but ISME J is another story: it is a long-established top journal in microbial ecology, with roughly 50 × 58 ≈ 2,900 articles that take close to 20 GB of disk space to store. This time he brings us the ISME J crawl.

Hands-on

Note that the scraper is written in Python. Create a folder named data_pdf to hold the downloaded PDFs, then run the code below.
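
If you prefer not to create the folder by hand, a minimal sketch like the following, placed near the top of the script, creates it automatically (data_pdf matches the folder name used in the code below):

import os

# Create the PDF output folder if it does not exist yet;
# exist_ok=True makes this safe to run more than once.
os.makedirs('data_pdf', exist_ok=True)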

A recommended IDE for running the Python code:

Article metadata table

Besides downloading the PDFs, we also save every article's metadata to a file named data.csv for later analysis. The full crawl took about 20 hours, most of it spent downloading PDFs, so start it in the evening and check on it the next day. You can also comment out the PDF-download part of the code and save only the article metadata, which is much faster.
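
Once the crawl finishes, the metadata can be loaded for analysis. A minimal sketch, assuming the column order the script writes (Title, DOI, Accesses, Citations, Altmetric, published date, authors); note that titles containing commas will break the naive comma-joined rows (see the csv.writer sketch after the script):

import pandas as pd

# data.csv has no header row, so supply column names matching the write order.
cols = ['Title', 'DOI', 'Accesses', 'Citations', 'Altmetric', 'Published', 'Authors']
# Rows whose titles contain extra commas will not parse cleanly; skip them here.
df = pd.read_csv('data.csv', names=cols, encoding='utf-8', on_bad_lines='skip')
print(df.head())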

import requests
from lxml import etree
import time
import datetime

# Fetch a page and return its HTML text (None on failure).
def get_one_page(url):
    try:
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'}
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except requests.RequestException:
        print("Request failed")
        return None

# Fetch a PDF and return the whole response object (None on failure).
def get_one_page_pdf(url):
    try:
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'}
        response = requests.get(url, headers=headers)
        print(response.status_code)
        if response.status_code == 200:
            return response
        return None
    except requests.RequestException:
        print("Request failed")
        return None

# Parse one article page and yield its metadata as a dict.
def parse_one_page(html):
    html = etree.HTML(html)
    title = html.xpath('//*[@id="content"]/div/div/article/div[1]/div[1]/header/h1/text()')
    Accesses = html.xpath('//*[@id="content"]/div/div/article/div[1]/div[1]/header/div/ul/li[1]/p/text()')
    DOI = html.xpath('//meta[@name="prism.doi"]/@content')
    published = html.xpath('//*[@id="content"]/div/div/article/div[1]/div[1]/header/ul[1]/li[3]/a/time/text()')
    Citations = html.xpath('//*[@id="content"]/div/div/article/div[1]/div[1]/header/div/ul/li[2]/p/text()')
    # Note: this XPath is identical to the Citations one; adjust the li index
    # if the Altmetric value ends up duplicating the citation count.
    Altmetric = html.xpath('//*[@id="content"]/div/div/article/div[1]/div[1]/header/div/ul/li[2]/p/text()')
    authors = html.xpath('//*[@id="content"]/div/div/article/div[1]/div[1]/header/ul[2]//text()')
    # Tokens that appear between author names and should be discarded.
    noise = [" \n ", "ORCID: ", ',', ', ', '\xa0']
    author_list = [author for author in authors if author not in noise and len(author) < 25]

    yield {
        'Title': title,
        'Accesses': Accesses,
        'DOI': DOI,
        'Citations': Citations,
        'Altmetric': Altmetric,
        'published': published,
        'author_list': author_list
    }

# Crawl one page of search results: save metadata to data.csv and download each PDF.
def get_paper_url(n):
    # Search-results URL for page n of ISME J articles
    url = 'https://www.nature.com/search?order=relevance&journal=ismej&page={num}'.format(num=n)
    # Fetch the search-results page
    html = get_one_page(url)
    print('Search page fetched')
    # Extract the href of every article on this results page
    html = etree.HTML(html)
    href = html.xpath('//h2/a[@itemprop="url"]/@href')
    # Visit each article in turn
    for i in href:
        # URL of the article page and of its PDF
        url = 'https://www.nature.com' + str(i)
        url_pdf = 'https://www.nature.com' + i + '.pdf#page=1'
        print(url_pdf)
        html = get_one_page(url)
        print('Article page fetched')
        for item in parse_one_page(html):
            # Append this article's metadata to data.csv
            with open('data.csv', 'a', encoding='utf-8') as f:
                print("===saving metadata===")
                f.write(' '.join(item['Title']) + ',')
                f.write(' '.join(item['DOI']) + ',')
                f.write(' '.join(item['Accesses']) + ',')
                f.write(' '.join(item['Citations']) + ',')
                f.write(' '.join(item['Altmetric']) + ',')
                f.write(' '.join(item['published']) + ',')
                f.write(' '.join(item['author_list']) + '\n')
                print("===metadata saved===")
            time.sleep(2)
            # Download the PDF (comment out this block to save metadata only)
            print('======fetching pdf======')
            r = get_one_page_pdf(url_pdf)
            print('======saving pdf======')
            if r is not None and item['Title']:
                with open('data_pdf/' + item['Title'][0] + '.pdf', 'wb') as p:
                    p.write(r.content)
                print('======pdf saved======')
            time.sleep(2)

# Main program
def main():
    # The journal has about 58 pages of search results
    for n in range(1, 59):
        try:
            starttime = datetime.datetime.now()
            get_paper_url(n)
            endtime = datetime.datetime.now()
            duringtime = endtime - starttime
            print('Page %d done' % n)
            print(duringtime.seconds)
        except Exception:
            print("Page %d failed" % n)
            continue

# Run
if __name__ == '__main__':
    main()
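
Two robustness tweaks are worth considering. Article titles often contain commas, which corrupt the comma-joined rows in data.csv, and characters such as / or : that are illegal in filenames, which make the PDF save fail. A minimal sketch of both fixes using only the standard library (safe_filename and append_row are hypothetical helper names, not part of the original script):

import csv
import re

def safe_filename(title, max_len=120):
    # Replace characters that are not allowed in Windows/Unix filenames
    # and truncate very long titles.
    return re.sub(r'[\\/:*?"<>|]', '_', title)[:max_len]

def append_row(item, path='data.csv'):
    # csv.writer quotes fields that contain commas, so each title stays in one column.
    row = [' '.join(item[k]) for k in
           ('Title', 'DOI', 'Accesses', 'Citations', 'Altmetric', 'published')]
    row.append(' '.join(item['author_list']))
    with open(path, 'a', encoding='utf-8', newline='') as f:
        csv.writer(f).writerow(row)

With these in place, the f.write(...) block inside get_paper_url() could be replaced by append_row(item), and the PDF path built as 'data_pdf/' + safe_filename(item['Title'][0]) + '.pdf'.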
