用Excel来写个爬虫怎么样?
大家好!我是啥子都不精通,但是懂一丢丢Excel的E精精!
不知道大家阅读到此文是什么时辰,我现在是夜深人静,安心码字的午夜!
今天我们来给大家分享一下,如何使用Excel来采集数据! 我们就以豆瓣读书TOP250为例吧(仅用于交流学习演示使用)!
网址:https://book.douban.com/top250?icn=index-book250-all
实际翻页有变化,后面再唠!
今天我们采用的方式是VBA来处理,之前我们也分享过使用Power Query的方式!
感兴趣的,点击飞机票直达->【PQ实战 | 使用Excel收集全国天气历史数据】
爬虫涉及的东西太多,今天我们尽力简单一下,只谈实操,不涉及过多理论!
今天我们要处理的是一个静态网页,所以不涉及分析ajax等乱七八糟的东西!
复制可自行使用:
'公众号:Excel办公实战
'作者:E精精
'功能:获取HTML源码(参数页码)
'日期:20210504
'------------------------------------------------------------------
Function GetHTML(ByVal page As Long)
Dim strText As String
Dim StrURL As String
StrURL = "https://book.douban.com/top250?start=" & page * 25
strRef = "https://book.douban.com/top250?start=" & _
Application.Min(0, page - 1) * 25
With CreateObject("WinHttp.WinHttpRequest.5.1")
.Open "GET", StrURL, False
'添加请求头
.setRequestHeader "Referer", strRef
.Send
strText = .responsetext
GetHTML = strText
End With
End Function
'公众号:Excel办公实战
'作者:E精精
'功能:使用htmlfile对象提取网页内容
'日期:20210504
'-----------------------------------------------------
Function getDataByXpath(str_Html As String)
Dim htmlfile As Object, DataTable As Object
Dim arr(1 To 10000, 1 To 5)
Set htmlfile = CreateObject("htmlfile")
htmlfile.write str_Html
'等在加载解析数据
DoEvents: DoEvents
Set DataTable = htmlfile.getElementsByTagName("table")
For Each eve In DataTable
n = n + 1
'a/p/span标签
Set aNode = eve.getElementsByTagName("a")
Set pNode = eve.getElementsByTagName("p")
Set spanNode = eve.getElementsByTagName("span")
bookName = aNode(1).innerText '书名
bookInfo = pNode(0).innerText '作者/出版社/日期/价格
score = spanNode(1).innerText '评分
pCount = spanNode(2).innerText '评分人数
If pNode.Length > 1 Then
abstract = pNode(1).Children(0).innerText '概要
Else
abstract = ""
End If
'打印预览数据
Debug.Print Join(Array(bookName, bookInfo, _
score, pCount, abstract), Chr(10))
Debug.Print String(30, "-")
'写入数组
arr(n, 1) = bookName: arr(n, 2) = bookInfo
arr(n, 3) = score: arr(n, 4) = pCount
arr(n, 5) = abstract
Next
Dim maxRow As Long
With Sheet1
maxRow = .Cells(Rows.Count, 1).End(3).Row + 1
.Range("A" & maxRow).Resize(n, 5) = arr
End With
End Function
'主调用函数
Sub Main()
Dim strHtml As String
With Sheet1
.Cells.Clear
Title = [{"书名","作者等信息","评分","评分人数","概要"}]
.Range("A1").Resize(1, 5) = Title
End With
For page = 0 To 9
strHtml = GetHTML(page)
getDataByXpath strHtml
Next
End Sub
赞 (0)