一段快乐的代码
# Approach
# 1. Fetch the Tieba forum listing page; walk through pages by following the
#    URL pattern (query parameters ``kw`` and ``pn``).
# 2. Collect the URL of every thread on one listing page.
# 3. Request each thread URL and collect the image URLs inside it.
# 4. Request each image URL in turn and save the bytes locally (mode 'wb').
#
# XPath for thread links on a listing page:
#   //div[@class="t_con cleafix"]/div/div/div/a/@href
# XPath for images inside a thread:
#   //div[@class="d_post_content j_d_post_content clearfix"]/img[@class="BDE_Image"]/@src
# Example listing query string: kw=%E6%A0%A1%E8%8A%B1&pn=100

import urllib.parse

import requests
from lxml import etree


class BaiduImgSpider:
    """Crawl a Baidu Tieba forum and save the media found in its threads.

    The crawl is three nested steps: listing page -> thread pages -> media
    files. Each media file is written to the current working directory.
    A request that fails (network error, timeout, HTTP error status) is
    reported on stdout and skipped; the crawl continues with the next item.
    """

    # Seconds to wait on a single HTTP request before giving up. Without a
    # timeout, requests waits indefinitely on a stalled connection.
    TIMEOUT = 10

    # Characters replaced in generated file names: path separators and
    # characters that are commonly rejected in file names.
    _UNSAFE_FILENAME_CHARS = frozenset('/\\?%*:|"<>')

    def __init__(self):
        self.baseurl = 'http://tieba.baidu.com'
        # NOTE(review): this User-Agent lacks the closing ')' of a canonical
        # IE9 string. It is kept byte-identical on purpose, because the
        # markup the server returns may depend on the exact value - confirm
        # before "fixing" it.
        self.headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0"}
        self.mainurl = 'http://tieba.baidu.com/f?'

    def _fetch(self, url, params=None):
        """GET *url* and return the response, or ``None`` if the request failed.

        Failures (connection errors, timeouts, malformed URLs, 4xx/5xx
        statuses) are printed and swallowed so that one bad link does not
        abort the whole crawl.
        """
        try:
            res = requests.get(url, params=params, headers=self.headers,
                               timeout=self.TIMEOUT)
            # Turn HTTP error statuses into exceptions so that error pages
            # are never parsed as content or saved as media files.
            res.raise_for_status()
        except requests.RequestException as err:
            print("请求失败: %s (%s)" % (url, err))
            return None
        return res

    def _fetchTree(self, url, params=None):
        """Fetch *url* and return its parsed HTML tree, or ``None`` on failure."""
        res = self._fetch(url, params)
        if res is None:
            return None
        res.encoding = 'utf-8'
        # May be None when the body yields no tree (e.g. an empty document);
        # callers must check for that.
        return etree.HTML(res.text)

    @classmethod
    def _safeFilename(cls, img):
        """Return a local file name for the media URL *img*.

        The name is the last 12 characters of the URL (the original naming
        scheme), with every unsafe character replaced by ``_`` so that the
        result can never point into another directory.
        """
        return ''.join('_' if ch in cls._UNSAFE_FILENAME_CHARS else ch
                       for ch in img[-12:])

    def getPageUrl(self, params):
        """Fetch one listing page and crawl every thread linked from it.

        Args:
            params: Query parameters for the listing URL; ``workOn`` passes
                a dict with the keys ``kw`` and ``pn``.
        """
        tree = self._fetchTree(self.mainurl, params)
        if tree is None:
            return
        # "cleafix" is the spelling this selector has always used; it is not
        # corrected here because it must match the page markup exactly.
        hrefs = tree.xpath('//div[@class="t_con cleafix"]/div/div/div/a/@href')
        for href in hrefs:
            # urljoin handles rooted, relative and absolute hrefs alike,
            # where plain string concatenation only works for rooted ones.
            self.getImgUrl(urllib.parse.urljoin(self.baseurl, href))

    def getImgUrl(self, tLink):
        """Fetch one thread page and download every media URL found in it.

        Args:
            tLink: Absolute URL of the thread page.
        """
        tree = self._fetchTree(tLink)
        if tree is None:
            return
        # Matches embedded video sources as well as posted images.
        mediaUrls = tree.xpath('//div[@class="video_src_wrapper"]/embed/@data-video | //div[@class="d_post_content j_d_post_content clearfix"]/img[@class="BDE_Image"]/@src')
        for mediaUrl in mediaUrls:
            self.writeImage(mediaUrl)

    def writeImage(self, img):
        """Download the media file at *img* and save it in the working directory.

        Args:
            img: URL of the image (or video) to download.
        """
        res = self._fetch(img)
        if res is None:
            return
        filename = self._safeFilename(img)
        # res.content is the raw body; no text decoding is involved.
        with open(filename, 'wb') as f:
            f.write(res.content)
        print("%s下载成功" % filename)

    def workOn(self):
        """Prompt for a forum name and a page range, then crawl those pages."""
        name = input('输入要爬取的贴吧名称:')
        begin = int(input("起始页:"))
        end = int(input("终止页:"))
        for n in range(begin, end + 1):
            # The listing is addressed by item offset, 50 items per page.
            pn = (n - 1) * 50
            params = {
                "kw": name,
                "pn": pn,
            }
            self.getPageUrl(params)


if __name__ == "__main__":
    spider = BaiduImgSpider()
    spider.workOn()
the end
赞 (0)