A Qualified Data Analyst Shares a Few Things About Python Web Crawling (Comprehensive Practical Examples)
Continued from the previous article, "A Qualified Data Analyst Shares a Few Things About Python Web Crawling".
5. Comprehensive Practical Examples
1. Scraping Static Web Page Data
(1) Requirements
Scrape the publisher names from Douban and store them in an Excel file, a txt file, and a MySQL database.
(2) Analysis
- View the page source
- Press Ctrl+F and search for any publisher name, e.g. 博集天卷
- Determine the regex pattern
- '<div class="name">(.*?)</div>'
(3) Approach
- Download the target page
- Match the target content with the regex
- Store the matches in a Python list
- Write the list to Excel / txt / MySQL
(4) Source Code
```python
'''Storing the scraped data'''
import urllib.request
import re
import xlsxwriter
import MySQLdb

#----------------- (1) Store to Excel and txt -----------------#
def gxls_concent(target_url, pat):
    '''
    Purpose: scrape the data
    @target_url: target URL to scrape
    @pat: regex pattern used to filter the data
    '''
    data = urllib.request.urlopen(target_url).read()
    ret_concent = re.compile(pat).findall(str(data, 'utf-8'))
    return ret_concent

def wxls_concent(ret_xls, ret_concent):
    '''
    Purpose: write the final results into douban.xls
    @ret_xls: path of the Excel file storing the final results
    @ret_concent: list of scraped data
    '''
    # Open the output file
    wb1 = xlsxwriter.Workbook(ret_xls)
    # Create a worksheet object
    ws = wb1.add_worksheet()
    try:
        for i in range(len(ret_concent)):
            data = ret_concent[i]
            ws.write(i, 0, data)
        wb1.close()
    except Exception as er:
        print('Error while writing to "' + ret_xls + '"')
        print(er)

def wtxt_concent(ret_txt, ret_concent):
    '''
    Purpose: write the final results into douban.txt
    @ret_txt: path of the txt file storing the final results
    @ret_concent: list of scraped data
    '''
    fh = open(ret_txt, "wb")
    try:
        for i in range(len(ret_concent)):
            data = ret_concent[i] + "\r\n"
            fh.write(data.encode())
    except Exception as er:
        print('Error while writing to "' + ret_txt + '"')
        print(er)
    fh.close()

def mainXlsTxt():
    '''
    Purpose: store the data in the Excel and txt files
    '''
    target_url = 'https://read.douban.com/provider/all'   # target URL
    pat = '<div class="name">(.*?)</div>'                 # regex pattern
    ret_xls = "F:/spider_ret/douban.xls"                  # Excel file path
    ret_txt = "F:/spider_ret/douban.txt"                  # txt file path
    ret_concent = gxls_concent(target_url, pat)           # fetch the data
    wxls_concent(ret_xls, ret_concent)                    # write to Excel
    wtxt_concent(ret_txt, ret_concent)                    # write to txt
#--------------------- END (1) ---------------------#

#----------------- (2) Store to MySQL -----------------#
def db_con():
    '''
    Purpose: connect to the MySQL database
    '''
    con = MySQLdb.connect(
        host='localhost',        # host
        user='root',             # user name
        passwd='xxxx',           # password
        db='urllib_data',        # database name
        charset='utf8',
        local_infile=1
    )
    return con

def exeSQL(sql):
    '''
    Purpose: execute a SQL statement
    @sql: the SQL statement to run
    '''
    print("exeSQL: " + sql)
    # Connect to the database
    con = db_con()
    con.query(sql)

def gdb_concent(target_url, pat):
    '''
    Purpose: convert the scraped data into the insert format [[value_1],[value_2],...,[value_n]]
    @target_url: target URL to scrape
    @pat: regex pattern used to filter the data
    '''
    tmp_concent = gxls_concent(target_url, pat)
    ret_concent = []
    for i in range(len(tmp_concent)):
        ret_concent.append([tmp_concent[i]])
    return ret_concent

def wdb_concent(tbl_name, ret_concent):
    '''
    Purpose: write the scraped results into the MySQL database
    @tbl_name: table name
    @ret_concent: list of scraped data
    '''
    exeSQL("drop table if exists " + tbl_name)
    exeSQL("create table " + tbl_name + "(pro_name VARCHAR(100));")
    insert_sql = "insert into " + tbl_name + " values(%s);"
    con = db_con()
    cursor = con.cursor()
    try:
        cursor.executemany(insert_sql, ret_concent)
    except Exception as er:
        print('Error while executing MySQL statement "' + str(insert_sql) + '"')
        print(er)
    finally:
        cursor.close()
        con.commit()
        con.close()

def mainDb():
    '''
    Purpose: store the data in the MySQL database
    '''
    target_url = 'https://read.douban.com/provider/all'   # target URL
    pat = '<div class="name">(.*?)</div>'                 # regex pattern
    tbl_name = "provider"                                 # table name
    # Fetch the data
    ret_concent = gdb_concent(target_url, pat)
    # Write into the MySQL database
    wdb_concent(tbl_name, ret_concent)
#--------------------- END (2) ---------------------#

if __name__ == '__main__':
    mainXlsTxt()
    mainDb()
```
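After mainDb() has run, a minimal sketch like the following (an addition for verification, not part of the original script; it reuses the connection parameters above) can read the inserted rows back from the provider table:

```python
import MySQLdb

con = MySQLdb.connect(host='localhost', user='root', passwd='xxxx',
                      db='urllib_data', charset='utf8')
cursor = con.cursor()
cursor.execute("select pro_name from provider;")
for (pro_name,) in cursor.fetchall():
    print(pro_name)
cursor.close()
con.close()
```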
(5) Results
2. Scraping Data from Ajax-Based Web Pages
(1) Requirements
Scrape the data-mining position listings for Guangzhou from Lagou and store them in a local Excel file.
(2) Analysis
a. Where is the position data?
- Open Lagou => enter the keyword "數(shù)據(jù)挖掘" (data mining) => view the page source => no position data is found
- Open Lagou => enter the keyword "數(shù)據(jù)挖掘" => press F12 => refresh under the Network tab => follow the steps shown in the figure below
We can see JSON files whose names start with position and company; these are very likely the position data we need. Right-click one and choose "Open link in new tab" to confirm that it is indeed the content we want.
b. How do we turn pages?
The crawler needs to scrape multiple pages, i.e. simulate the page-turning action automatically. If we click "next page", the URL does not change, which is the hallmark of Ajax (asynchronous loading). Click the position JSON file and open the Headers tab on the right; at the bottom you can find the form data shown below.
When we turn to the next page, pn changes to 2 and first becomes false, so we can scrape each page by constructing the corresponding POST form, as sketched below.
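A minimal sketch of building that form data with urllib (the field names first, pn, and kd are the same ones used in the source code below):

```python
import urllib.parse

def build_form(page, keyword):
    '''Build the POST form data for one result page.'''
    return urllib.parse.urlencode({
        "first": "true" if page == 1 else "false",  # only the first request sends first=true
        "pn": page,                                 # pn is the page number
        "kd": keyword                               # kd is the search keyword
    }).encode("utf-8")
```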
c. What does the JSON data structure look like?
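A rough sketch of the relevant part of the returned JSON (the field names are the ones parsed in getInfoList() below, and the nesting follows the access path pos_dict["content"]["positionResult"]["result"]; everything else is omitted):

```python
pos_dict = {
    "content": {
        "positionResult": {
            "result": [
                {
                    "companyFullName": "...",
                    "companySize": "...",
                    "positionName": "...",
                    "education": "...",
                    "financeStage": "...",
                    "salary": "...",
                    "city": "...",
                    "district": "...",
                    "positionAdvantage": "...",
                    "workYear": "..."
                }
                # ... one dict per position
            ]
        }
    }
}
```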
(3) Source Code
```python
import urllib.request
import urllib.parse
import urllib.error
import socket
from multiprocessing.dummy import Pool
import json
import time
import xlsxwriter

#----------------------------------------------------------#
### (1) Fetch proxy IPs ###
def getProxies():
    '''
    Purpose: call the API to fetch a raw proxy IP pool
    '''
    url = "http://api.xicidaili.com/free2016.txt"
    i_headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0"}
    global proxy_addr
    proxy_addr = []
    try:
        req = urllib.request.Request(url, headers=i_headers)
        proxy = urllib.request.urlopen(req).read()
        proxy = proxy.decode('utf-8')
        proxy_addr = proxy.split('\r\n')   # one proxy per line
    except Exception as er:
        print(er)
    return proxy_addr

def testProxy(curr_ip):
    '''
    Purpose: verify each proxy IP against the Baidu home page
    @curr_ip: the proxy IP being verified
    '''
    socket.setdefaulttimeout(5)            # global timeout
    tarURL = "https://www.baidu.com/"      # test URL
    proxy_ip = []
    try:
        proxy_support = urllib.request.ProxyHandler({"http": curr_ip})
        opener = urllib.request.build_opener(proxy_support)
        opener.addheaders = [("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0")]
        urllib.request.install_opener(opener)
        res = urllib.request.urlopen(tarURL).read()
        proxy_ip.append(curr_ip)
        print(len(res))
    except Exception as er:
        print("Error while verifying proxy IP (" + curr_ip + "): " + str(er))
    return proxy_ip

def mulTestProxies(proxies_ip):
    '''
    Purpose: verify all proxy IPs with a worker pool
    @proxies_ip: proxy IP pool
    '''
    pool = Pool(processes=4)               # four workers (multiprocessing.dummy uses threads)
    proxies_addr = pool.map(testProxy, proxies_ip)
    pool.close()
    pool.join()                            # wait for all workers to finish
    # pool.map returns one (possibly empty) list per proxy; flatten it to a list of working IPs
    return [ip for sub in proxies_addr for ip in sub]

#----------------------------------------------------------#
### (2) Scrape the data ###
def getInfoDict(url, page, pos_words_one, proxy_addr_one):
    '''
    Purpose: fetch one page of position data and return it as a dict
    @url: target URL
    @page: page number to scrape
    @pos_words_one: search keyword (single)
    @proxy_addr_one: proxy IP to use (single)
    '''
    global pos_dict
    i_headers = ("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0")
    proxy = urllib.request.ProxyHandler({"http": proxy_addr_one})
    opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
    opener.addheaders = [i_headers]
    urllib.request.install_opener(opener)
    if page == 1:
        tORf = "true"
    else:
        tORf = "false"
    mydata = urllib.parse.urlencode({"first": tORf,
                                     "pn": page,          # changing pn turns the page
                                     "kd": pos_words_one}).encode("utf-8")
    try:
        req = urllib.request.Request(url, mydata)
        data = urllib.request.urlopen(req).read().decode("utf-8", "ignore")   # open through the proxy IP
        pos_dict = json.loads(data)                                           # convert str to dict
    except urllib.error.URLError as er:
        if hasattr(er, "code"):
            print("URLError while fetching the position JSON object, error code:")
            print(er.code)
        if hasattr(er, "reason"):
            print("URLError while fetching the position JSON object, reason:")
            print(er.reason)
    return pos_dict

def getInfoList(pos_dict):
    '''
    Purpose: convert the dict returned by getInfoDict() into a data list
    @pos_dict: dict of position data
    '''
    pos_list = []                              # list of positions
    jcontent = pos_dict["content"]["positionResult"]["result"]
    for i in jcontent:
        one_info = []                          # information about one position
        one_info.append(i["companyFullName"])
        one_info.append(i['companySize'])
        one_info.append(i['positionName'])
        one_info.append(i['education'])
        one_info.append(i['financeStage'])
        one_info.append(i['salary'])
        one_info.append(i['city'])
        one_info.append(i['district'])
        one_info.append(i['positionAdvantage'])
        one_info.append(i['workYear'])
        pos_list.append(one_info)
    return pos_list

def getPosInfo(pos_words, city_words, proxy_addr):
    '''
    Purpose: based on getInfoDict() and getInfoList(), loop over every page to build the final list of positions
    @pos_words: position keywords (multiple)
    @city_words: city keywords to filter by (multiple)
    @proxy_addr: proxy IP pool (multiple)
    '''
    posInfo_result = []
    title = ['Company full name', 'Company size', 'Position name', 'Education', 'Financing stage', 'Salary', 'City', 'District', 'Advantages', 'Work experience']
    posInfo_result.append(title)
    for i in range(0, len(city_words)):
        key_city = urllib.request.quote(city_words[i])
        # Filter keyword example: gj=fresh graduate&xl=junior college&jd=growth stage&hy=mobile internet&px=new&city=廣州
        url = "https://www.lagou.com/jobs/positionAjax.json?city=" + key_city + "&needAddtionalResult=false"
        for j in range(0, len(pos_words)):
            page = 1
            while page < 10:                   # Lagou shows 30 pages per keyword; only 10 pages are scraped here
                pos_words_one = pos_words[j]
                proxy_addr_one = proxy_addr[page]
                time.sleep(3)
                pos_info = getInfoDict(url, page, pos_words_one, proxy_addr_one)   # fetch one page
                pos_infoList = getInfoList(pos_info)
                posInfo_result += pos_infoList                                     # accumulate all pages
                page += 1
    return posInfo_result

#----------------------------------------------------------#
### (3) Store the data ###
def wXlsConcent(export_path, posInfo_result):
    '''
    Purpose: write the final results into a local Excel file
    @export_path: export path
    @posInfo_result: list of scraped data
    '''
    # Open the output file
    wb1 = xlsxwriter.Workbook(export_path)
    # Create a worksheet object
    ws = wb1.add_worksheet()
    try:
        for i in range(0, len(posInfo_result)):
            for j in range(0, len(posInfo_result[i])):
                data = posInfo_result[i][j]
                ws.write(i, j, data)
        wb1.close()
    except Exception as er:
        print('Error while writing to "' + export_path + '":')
        print(er)

#----------------------------------------------------------#
### (4) Define main() ###
def main():
    '''
    Purpose: main function; calls the functions above and produces positionInfo.xls under F:/spider_ret
    '''
    # --- (1) Build the proxy IP pool
    proxies = getProxies()                     # fetch the raw proxy IPs
    proxy_addr = mulTestProxies(proxies)       # verify the proxy IPs with multiple threads
    # --- (2) Scrape the data
    search_key = ["數(shù)據(jù)挖掘"]                   # position keywords (more than one can be set)
    city_word = ["廣州"]                        # search cities (more than one can be set)
    posInfo_result = getPosInfo(search_key, city_word, proxy_addr)   # scrape position data
    # --- (3) Store the data
    export_path = "F:/spider_ret/positionInfo.xls"                   # export path
    wXlsConcent(export_path, posInfo_result)                         # write to Excel

if __name__ == "__main__":
    main()
```
Continued in the next article, "A Qualified Data Analyst Shares a Few Things About Python Web Crawling (Scrapy Automated Crawler)".
[This is an original article from the 51CTO column organization "豈安科技". For reprints, please contact the original author via the WeChat public account (bigsec).]