Taobao's search result pages are complex, so picking apart their Ajax requests or JavaScript by hand is tedious. This post instead drives a real browser with Selenium to scrape every product listed under Taobao's '美食' (food) search results.
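The idea, in short: let a real browser render the page so the asynchronously loaded content is already in the DOM, then wait explicitly for the elements you need. Below is a minimal standalone sketch of that pattern (an illustration, not part of the original spider; it assumes Selenium with a Firefox/geckodriver setup, whereas the spider below uses PhantomJS), just to show the WebDriverWait usage the spider relies on throughout:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

browser = webdriver.Firefox()          # any Selenium-supported browser works here
wait = WebDriverWait(browser, 10)      # give the page up to 10 seconds to render

browser.get('https://www.taobao.com')
# Block until the page's JavaScript has actually rendered the search box
search_box = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#q')))
search_box.send_keys(u'美食')
browser.quit()

The full spider follows.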
spider.py
#encoding:utf8
import re
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
from config import *
import pymongo

client = pymongo.MongoClient(MONGODB_URL)
db = client[MONGODB_DB]

## Use PhantomJS with a few service arguments (see config.py)
browser = webdriver.PhantomJS(service_args=SERVICE_ARGS)
## Set the window size explicitly; the default is too small and breaks the page layout
browser.set_window_size(1400, 900)
wait = WebDriverWait(browser, 10)


def search():
    print('searching...')
    ## This step times out easily
    try:
        ## Wait until both the search box and the submit button are ready
        browser.get("https://www.taobao.com")
        input = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#q'))
        )
        submit = wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, '#J_TSearchForm > div.search-button > button'))
        )
        ###### Python 2 quirk: the keyword is stored as escaped unicode and decoded here
        # input.send_keys('\u7f8e\u98df'.decode("unicode-escape"))
        input.send_keys(KEYWORD.decode("unicode-escape"))
        submit.click()
        total = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.total'))
        )
        get_product()
        return total.text
    except TimeoutException:
        return search()


def next_page(page_number):
    print('turning to page ' + str(page_number))
    try:
        input = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > input'))
        )
        submit = wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit'))
        )
        input.clear()
        input.send_keys(page_number)
        submit.click()
        ## Confirm the jump worked: the highlighted page number must equal the one we typed in
        wait.until(EC.text_to_be_present_in_element(
            (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > ul > li.item.active > span'),
            str(page_number)))
        get_product()
    except TimeoutException:
        return next_page(page_number)


# Extract the product information from the current result page
def get_product():
    products = wait.until(
        EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-itemlist .m-itemlist .items'))
    )
    ## Grab the fully rendered page source and parse it with BeautifulSoup
    html = browser.page_source
    soup = BeautifulSoup(html, 'lxml')
    items = soup.select('#mainsrp-itemlist .m-itemlist .items .item.J_MouserOnverReq')
    print('************************* reached here *************')
    for item in items:
        img = item.select('.J_ItemPic.img')[0].get('src')
        price = item.select('.price.g_price.g_price-highlight > strong')[0].get_text()
        deal = item.select('.deal-cnt')[0].get_text()
        title = item.select('.row.row-2.title > a')[0].get_text().strip()
        shop = item.select('.row.row-3.g-clearfix > .shop > a > span:nth-of-type(2)')[0].get_text()
        location = item.select('.location')[0].get_text()
        product = {
            'img': img,
            'price': price,
            'deal': deal,
            'title': title,
            'shop': shop,
            'location': location
        }
        # Print the product; decode the JSON dump so the Chinese text is readable under Python 2
        import json
        j = json.dumps(product)
        dict2 = j.decode("unicode-escape")
        print dict2
        save_to_mongo(product)


def save_to_mongo(product):
    try:
        if db[MONGODB_TABLE].insert(product):
            print('saved to mongodb: ' + str(product))
    except Exception:
        print('failed to save to mongodb: ' + str(product))


def main():
    try:
        total = search()
        ## Pull the total page count out of text like "共 100 页" with a regex
        s = re.compile(r'(\d+)')
        total = int(s.search(total).group(1))
        for i in range(2, total + 1):
            next_page(i)
    except Exception:
        print('error occurred')
    finally:
        browser.close()


if __name__ == '__main__':
    main()
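Note that newer Selenium releases have deprecated and eventually removed PhantomJS support, so webdriver.PhantomJS may not exist in your environment. A hedged sketch of the same browser setup using headless Chrome instead (assuming a recent Selenium and a chromedriver binary on PATH; the rest of spider.py stays the same, apart from SERVICE_ARGS no longer being needed):

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

chrome_opts = Options()
chrome_opts.add_argument('--headless')              # run Chrome without opening a window
chrome_opts.add_argument('--window-size=1400,900')  # same size the spider sets explicitly

browser = webdriver.Chrome(options=chrome_opts)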
config.py
MONGODB_URL = 'localhost'
MONGODB_DB = 'taobao'
MONGODB_TABLE = 'meishi'

SERVICE_ARGS = ['--load-images=false', '--disk-cache=true']

## The unicode escapes spell out "美食" (food); using the Chinese characters directly raised an error under Python 2
KEYWORD = '\u7f8e\u98df'
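After a run, you can verify that the items actually landed in MongoDB. A small check with pymongo (a sketch assuming the same localhost settings as config.py; count_documents needs pymongo 3.7 or newer):

import pymongo

client = pymongo.MongoClient('localhost')
collection = client['taobao']['meishi']

# How many products have been stored so far
print(collection.count_documents({}))

# Peek at a few of the stored documents
for product in collection.find().limit(3):
    print(product)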
That is all there is to this example of scraping Taobao's asynchronously loaded data with Python and Selenium. I hope it serves as a useful reference, and please continue to support 服务器之家.
Original article: https://blog.csdn.net/wqh_jingsong/article/details/66472106