基于golang的爬虫,爬取QQ邮箱号、链接、手机号、身份证号
爬虫基础方案:只做了基本的接口封装和使用,并未使用并发处理
代码篇
// Command main is a minimal, sequential web scraper: it downloads a handful
// of pages and prints every e-mail address, absolute link, mobile phone
// number and ID-card number found in them. No concurrency is used.
package main

import (
	"fmt"
	"io/ioutil"
	"log"
	"net/http"
	"regexp"
)

// Pattern sources, kept as plain strings under their original names and
// compiled once below.
var (
	// \w matches letters, digits and underscore.
	reEmail = `\w+@\w+\.\w+`
	// s? makes the "s" optional; [\s\S] matches any character, newlines
	// included; +? is the NON-greedy "one or more", so the capture group
	// stops at the first closing quote.
	reLinke  = `href="(https?://[\s\S]+?)"`
	rePhone  = `1[3456789]\d\s?\d{4}\s?\d{4}`
	reIdcard = `[123456789]\d{5}((19\d{2})|(20[01]\d))((0[1-9])|(1[012]))((0[1-9])|([12]\d)|(3[01]))\d{3}[\dXx]`
	reImg    = `https?://[^"]+?(\.((jpg)|(png)|(jpeg)|(gif)|(bmp)))`
)

// Compiled forms of the patterns above. Compiling at package scope avoids
// recompiling the same expression on every call.
var (
	emailRe  = regexp.MustCompile(reEmail)
	linkRe   = regexp.MustCompile(reLinke)
	phoneRe  = regexp.MustCompile(rePhone)
	idCardRe = regexp.MustCompile(reIdcard)
)

// matchPage downloads url and returns every match of re in the page body,
// each match carrying its capture groups (index 0 is the full match).
func matchPage(url string, re *regexp.Regexp) [][]string {
	return re.FindAllStringSubmatch(GetPageStr(url), -1)
}

// GetEmail2 prints every e-mail address found on the page at url, one match
// per line, in the bracketed slice form produced by fmt.Println.
func GetEmail2(url string) {
	for _, m := range matchPage(url, emailRe) {
		fmt.Println(m)
	}
}

// GetLink prints the target of every absolute http(s) href attribute found
// on the page at url, one per line.
func GetLink(url string) {
	for _, m := range matchPage(url, linkRe) {
		// m[1] is the captured URL without the surrounding href="...".
		fmt.Println(m[1])
	}
}

// GetPhone prints every mobile phone number found on the page at url, one
// match per line.
func GetPhone(url string) {
	for _, m := range matchPage(url, phoneRe) {
		fmt.Println(m)
	}
}

// GetIdCard prints every 18-character ID-card number found on the page at
// url. Each line contains the full match followed by the pattern's capture
// groups (year, month and day alternatives).
func GetIdCard(url string) {
	for _, m := range matchPage(url, idCardRe) {
		fmt.Println(m)
	}
}

// fetchPage performs an HTTP GET on url and returns the response body as a
// string. It returns an error when the request fails, when the status is not
// 200 OK, or when the body cannot be read.
func fetchPage(url string) (string, error) {
	resp, err := http.Get(url)
	if err != nil {
		// resp is nil here; touching resp.Body would panic.
		return "", fmt.Errorf("http.Get url: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return "", fmt.Errorf("status code error: %d %s", resp.StatusCode, resp.Status)
	}

	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return "", fmt.Errorf("reading body of %s: %w", url, err)
	}
	return string(body), nil
}

// GetPageStr returns the body of the page at url as a string. Any failure
// (request error, non-200 status, read error) is logged and terminates the
// program.
func GetPageStr(url string) (pageStr string) {
	pageStr, err := fetchPage(url)
	if err != nil {
		log.Fatal(err)
	}
	return pageStr
}

func main() {
	// Scrape e-mail addresses.
	GetEmail2("https://tieba.baidu.com/p/6051076813?red_tag=1573533731")
	// Scrape links.
	GetLink("http://www.baidu.com/s?wd=%E8%B4%B4%E5%90%A7%20%E7%95%99%E4%B8%8B%E9%82%AE%E7%AE%B1&rsv_spt=1&rsv_iqid=0x98ace53400003985&issp=1&f=8&rsv_bp=1&rsv_idx=2&ie=utf-8&tn=baiduhome_pg&rsv_enter=1&rsv_dl=ib&rsv_sug2=0&inputT=5197&rsv_sug4=6345")
	// Scrape mobile phone numbers.
	GetPhone("https://www.zhaohaowang.com/")
	// Scrape ID-card numbers.
	GetIdCard("https://henan.qq.com/a/20171107/069413.htm")
}
结果:
[1184822807@qq.com] [1184822807@qq.com] [598088118@qq.com] [598088118@qq.com] [835428013@qq.com] ------ ---- [16050271557] ---- [410222198706134038 1987 1987 06 06 13 13 ] Process finished with exit code 0
赞 (0)