github源碼地址:
https://github.com/kuishou68/python
各類圖表的實現效果
爬取的說說內容
個性化說說內容詞雲圖
每年發表說說總數柱狀圖、每年點贊和評論折線圖
7天好友動態柱狀圖、餅圖
使用方法
按照你的谷歌瀏覽器版本,下載對應版本的驅動:http://chromedriver.storage.googleapis.com/index.html
把驅動跟兩個python腳本放入同一目錄。我的版本是90.0.4430的,請查看你自己的瀏覽器版本,下載後把我的chromedriver.exe替換掉!
這裡用到了很多第三方包,把鼠標放在報紅的包名上,用 Alt+Enter 導包;如果失敗,則在控制臺執行下面的命令:
pip install 包名 -i http://pypi.douban.com/simple/ --trusted-host pypi.douban.com
主要代碼
qq空間txt.py
"""Log in to Qzone with Selenium and dump a friend's posts to qq_word.txt.

The script opens the QQ login page, signs in with the account below, opens
the friend's post list, and walks through every page. For each post it prints
the author, time, content, like text and comment text, and appends the content
to ``qq_word.txt``.
"""
import time

from selenium import webdriver
from lxml import etree

# NOTE(review): replace these with your own values and never commit a real
# password to version control.
friend = '1569339843'  # friend's QQ number; their Qzone must allow you to visit
user = '783533896'  # your QQ number
pw = '1323mkonji.@'  # your QQ password


def _first_text(nodes):
    """Return the first item of an XPath result list, or '' when it is empty."""
    return nodes[0] if len(nodes) > 0 else ''


# Start the browser driver.
# NOTE(review): `executable_path` and `find_element_by_id` are the Selenium 3
# style API used by this article; newer Selenium releases may require
# `Service` / `By` locators instead -- confirm against the installed version.
chrome_driver = 'chromedriver.exe'
driver = webdriver.Chrome(executable_path=chrome_driver)

try:
    # Maximize the browser window.
    driver.maximize_window()
    # Open the QQ login page.
    driver.get("http://i.qq.com")
    # The login form lives inside a frame; select it first, otherwise the
    # elements below cannot be found.
    driver.switch_to.frame("login_frame")
    time.sleep(3)
    # Switch to the account/password login mode.
    driver.find_element_by_id("switcher_plogin").click()
    time.sleep(3)
    # Type the QQ account.
    driver.find_element_by_id("u").send_keys(user)
    time.sleep(5)
    # Type the password.
    driver.find_element_by_id("p").send_keys(pw)
    time.sleep(5)
    # Click the login button.
    driver.find_element_by_id("login_button").click()
    time.sleep(5)
    # Hand control back to the top-level page.
    driver.switch_to.default_content()
    time.sleep(5)
    # Jump to the post list; change `friend` to visit another space.
    driver.get("http://user.qzone.qq.com/" + friend + "/311")
    time.sleep(5)

    next_num = 0  # numeric suffix of the current "next page" button id
    while True:
        # Scroll down so the browser loads the dynamically loaded content:
        # i runs from 1 to 5, i.e. five scroll steps per page.
        for i in range(1, 6):
            height = 20000 * i  # scroll by 20000 * i pixels on step i
            strword = "window.scrollBy(0," + str(height) + ")"
            driver.execute_script(strword)
            time.sleep(4)

        # A page is often made of several <frame>/<iframe> elements and
        # webdriver targets the outermost one by default, so select the frame
        # that holds the posts before looking for elements.
        driver.switch_to.frame("app_canvas_frame")
        selector = etree.HTML(driver.page_source)
        divs = selector.xpath('//*[@id="msglist"]/li/div[3]')

        # Mode 'a' appends, so content accumulates across pages; the explicit
        # utf-8 encoding avoids errors when writing the text to the file.
        with open('qq_word.txt', 'a', encoding="utf-8") as f:
            for div in divs:
                qq_name = _first_text(div.xpath('./div[2]/a/text()'))
                qq_content = _first_text(div.xpath('./div[2]/pre/text()'))
                qq_content = qq_content.replace('\n', ' ')
                qq_time = _first_text(
                    div.xpath('./div[4]/div[1]/span/a/text()'))
                qq_praise = _first_text(
                    div.xpath('./div[4]/div[2]/span/span/a[2]/text()'))
                qq_comment = _first_text(
                    div.xpath('./div[4]/div[2]/a[3]/text()'))
                print(qq_name, qq_time, qq_content, qq_praise, qq_comment)
                f.write(qq_content + "\n")

        # On the last page the "next page" button no longer has an id, so
        # stop there.
        if driver.page_source.find('pager_next_' + str(next_num)) == -1:
            break
        # The id of the "next page" button changes on every page, so it is
        # rebuilt from the running counter.
        driver.find_element_by_id('pager_next_' + str(next_num)).click()
        next_num += 1
        # The next iteration scrolls the page first, so return to the outer
        # frame.
        driver.switch_to.parent_frame()
finally:
    # Always close the browser, even if a step above raised.
    driver.quit()
各種圖表的生成
"""Build per-year charts from the scraped Qzone posts stored in qq_excel.xlsx.

Running the script renders two HTML files: a bar chart of posts per year and
a line chart of comments and likes per year.
"""
import re

import pandas as pd
import pyecharts.options as opts
from pyecharts.charts import Bar
from pyecharts.charts import Line
from pyecharts.charts import Pie

# Reads the default sheet (sheet 0) into a pandas DataFrame.
df_excel = pd.read_excel('qq_excel.xlsx')


def gettimestr(row):
    """Return the text before '年' in the row's '時間' cell, or None if empty."""
    item = row['時間']
    if pd.isna(item):
        return None
    return item.split('年')[0]


def readcount(result, row):
    """Add one to ``result[year]`` for this row; rows without a time are skipped."""
    timedata = gettimestr(row)
    if timedata is None:
        return
    result[timedata] = result.get(timedata, 0) + 1


def _add_bracket_count(result, row, column):
    """Add the integer found between '(' and ')' in ``row[column]`` to ``result[year]``.

    Rows are skipped when the cell is empty, contains no '(', or the row has
    no time. ``int()`` raises ValueError if the bracketed text is not a number.
    """
    item = row[column]
    if pd.isnull(item):
        return
    parts = item.split("(")
    if len(parts) <= 1:
        return
    data = parts[1].split(")")[0]
    timedata = gettimestr(row)
    if timedata is None:
        return
    result[timedata] = result.get(timedata, 0) + int(data)


def readthumb(result, row):
    """Accumulate the per-year like count from the row's '贊' cell."""
    _add_bracket_count(result, row, '贊')


def readcomment(result, row):
    """Accumulate the per-year comment count from the row's '評論' cell."""
    _add_bracket_count(result, row, '評論')


def readexcel(df_excel):
    """Return ``{'count': ..., 'thumb': ..., 'comment': ...}`` per-year totals."""
    count = {}
    thumb = {}
    comment = {}
    for index, row in df_excel.iterrows():
        readcount(count, row)
        readthumb(thumb, row)
        readcomment(comment, row)
    return {'count': count, 'thumb': thumb, 'comment': comment}


def getkeyandval(keyword):
    """Return ``[years, values]`` for one statistic, in reversed row order.

    ``keyword`` is one of 'count', 'thumb' or 'comment'.
    """
    data = readexcel(df_excel).get(keyword)
    key = list(data)
    value = list(data.values())
    # presumably the sheet lists newest posts first, so reversing yields
    # chronological order -- verify against the data
    key.reverse()
    value.reverse()
    return [key, value]


def paintbar():
    """Render the bar chart of posts published per year to an HTML file."""
    data = getkeyandval('count')
    print(data[0])
    # pyecharts v1 and later support chained calls.
    (
        Bar()
        .add_xaxis(data[0])
        .add_yaxis("每年發表說說總數", data[1])
        .render("每年發表說說總數柱狀圖.html")
    )


paintbar()


def paintline():
    """Render the line chart of comments and likes per year to an HTML file."""
    commentdata = getkeyandval('comment')
    thumbdata = getkeyandval('thumb')
    # NOTE(review): the x axis uses the comment years only; if the like data
    # covers a different set of years the two series will not line up.
    xaxis_data = commentdata[0]
    commentvalue = commentdata[1]
    thumbvalue = thumbdata[1]
    (
        Line()
        .add_xaxis(xaxis_data=xaxis_data)
        .add_yaxis("每年評論數", y_axis=commentvalue)
        .add_yaxis("每年點贊數", y_axis=thumbvalue)
        .render("每年點贊和評論折現圖.html")  # write the chart file
    )


paintline()
其他代碼自行下載項目查看
以上就是python爬取網頁版qq空間、生成各類圖表的詳細內容,更多關於python爬取qq空間的資料請關注服務器之家其它相關文章!
原文鏈接:https://github.com/kuishou68/python