1. Fetching the Data

The first step is to request the page with requests, raising an error on a non-200 status and letting apparent_encoding fix the character set:
```python
import requests

def drg(url):
    try:
        head = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                              'AppleWebKit/537.36 (KHTML, like Gecko) '
                              'Chrome/91.0.4472.164 Safari/537.36'}
        r = requests.get(url, headers=head)
        r.raise_for_status()  # raise HTTPError if the status code is not 200
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        return "an exception occurred"

url = "https://www.ip138.com/mobile.asp?mobile=13018305773&action=mobile"
print(drg(url))
```
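Before writing any parsing code it helps to look at what actually came back. A minimal usage sketch that saves the fetched page for offline inspection (the output file name is my own choice, not from the original article):

```python
# Save the raw HTML returned by drg() so the page structure can be
# inspected in a browser or editor before writing any parsing code.
html = drg("https://www.ip138.com/mobile.asp?mobile=13018305773&action=mobile")
with open("mobile_lookup.html", "w", encoding="utf-8") as f:  # hypothetical file name
    f.write(html)
```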
2. Parsing the Data

Pages behind a login need a requests.Session: post the login form once, then reuse the authenticated cookies for every later request:
```python
import requests

def login():
    try:
        # URL used for the login request
        urllogin = ("http://www.cqooc.com/user/login?username=12608199000635"
                    "&password=48C032612C2A6777D28A969307B52127E198D59AA78522943C1B283CF7B89E69"
                    "&nonce=6BA36BBB1F623279&cnonce=8257070573EFE28F")
        s = requests.session()
        r = s.post(urllogin, data=Form, headers=headers)
        r.encoding = r.apparent_encoding
        r.raise_for_status()
        return s
    except Exception as error:
        print(error)

def get_html(s, url):
    try:
        r = s.get(url, headers=headers)
        r.encoding = r.apparent_encoding
        r.raise_for_status()
        return r.text
    except Exception as error:
        print(error)

if __name__ == "__main__":
    # User-Agent sent with the logged-in requests
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36",
    }
    # replace these with your own credentials
    Form = {
        "username": "12608199000635",
        "password": "48C032612C2A6777D28A969307B52127E198D59AA78522943C1B283CF7B89E69",
        "nonce": "6BA36BBB1F623279",
        "cnonce": "8257070573EFE28F"
    }
    lin = login()
    # URL of the personal-center page
    url = "http://www.cqooc.com/my/learn"
    html = get_html(lin, url)
    print(html)
```
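Whether the session is really authenticated is easiest to confirm with a quick check. A minimal sketch; the marker string "退出" (log out) is an assumption about what the personal-center page shows to logged-in users, so adjust it to a string you actually see after logging in:

```python
# Log in, fetch the personal-center page, and look for a marker string
# that should only appear for authenticated users.
s = login()
page = get_html(s, "http://www.cqooc.com/my/learn")
if page and "退出" in page:  # "退出" is an assumed marker, not verified
    print("login OK")
else:
    print("login may have failed; check Form and headers")
```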
3. Saving the Data as CSV and Into a Database
Saving as CSV

This example crawls the Qidian "fengyun" ranking, parses each book entry with lxml XPath, and appends the rows to a CSV file:
```python
import requests
from lxml import etree
import csv

# fetch the page
def get_html(url, time=30):
    try:
        r = requests.get(url, timeout=time)
        r.encoding = r.apparent_encoding
        r.raise_for_status()
        return r.text
    except Exception as error:
        print(error)

def parser(html):  # parsing function
    doc = etree.HTML(html)  # parse the HTML into an lxml element tree
    out_list = []  # list holding the parsed rows
    # two-step lookup: first each list item, then the fields inside it
    for row in doc.xpath("//*[@class='book-img-text']//li/*[@class='book-mid-info']"):
        row_data = [
            row.xpath("h4/a/text()")[0],                    # title
            row.xpath("p[@class='author']/a/text()")[0],    # author
            row.xpath("p[2]/text()")[0].strip(),            # introduction
            row.xpath("p[@class='update']/span/text()")[0]  # update date
        ]
        out_list.append(row_data)  # append each parsed row to the output list
    return out_list

def save_csv(item, path):
    # write the list data to a file; utf-8 avoids garbled characters
    with open(path, "a+", newline='', encoding="utf-8") as f:  # create a utf-8 file
        csv_write = csv.writer(f)  # create the writer object
        csv_write.writerows(item)  # write all rows at once

if __name__ == "__main__":
    for i in range(1, 6):
        url = "https://www.qidian.com/rank/fengyun?style=1&page={0}".format(i)
        html = get_html(url)       # fetch the page
        out_list = parser(html)    # parse the page into a list of rows
        save_csv(out_list, "d:\\book.csv")  # save the data
```
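The loop appends bare rows, so the file ends up with no column labels. An optional sketch that writes a header row once before crawling; the column names are my own labels for the fields parser() extracts:

```python
# Write a one-off header row. Run this before the crawl loop (or delete
# d:\book.csv first), since save_csv opens the file in append mode.
save_csv([["title", "author", "intro", "updated"]], "d:\\book.csv")
```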
Saving Into the Database

The same pattern works for MySQL: parse the Douban new-book list, collect the rows in a list, then bulk-insert them with pymysql:
```python
import pymysql
import requests
from lxml import etree

def get_html(url, time=3000):
    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36 Edg/94.0.992.31"
        }
        r = requests.get(url, timeout=time, headers=headers)
        r.encoding = r.apparent_encoding
        r.raise_for_status()
        return r.text
    except Exception as err:
        print(err)

result = []

def parse_html(html):
    html = etree.HTML(html)
    for row in html.xpath('//*[@id="content"]/div/div[1]/ul/li'):
        Naame = row.xpath("div[2]/h2/a/text()")[0].strip()          # book title
        score = row.xpath("div[2]/p[2]/span[2]/text()")[0].strip()  # rating
        # slash-separated publication info; keep the split parts in a
        # separate variable so indexing stays on the list, not a string
        parts = row.xpath("div[2]/p[1]/text()")[0].strip().split("/")
        price = parts[0]
        content = parts[1]
        a = parts[2]
        b = parts[-1]
        detail = [Naame, score, price, content, a, b]
        result.append(detail)

def join_all(sql_insert, vals, **dbinfo):
    connet = pymysql.connect(**dbinfo)
    cursor = connet.cursor()
    try:
        cursor.executemany(sql_insert, vals)  # insert all rows in one call
        connet.commit()
        print("rows inserted successfully!")
    except Exception as err:
        print(err)
        connet.rollback()
    finally:
        cursor.close()
        connet.close()

if __name__ == "__main__":
    parms = {
        "host": "127.0.0.1",
        "port": 3306,
        "user": "root",
        "passwd": "123456",
        "db": "db",
        "charset": "utf8"
    }
    for page in range(1, 16):
        url = "https://book.douban.com/latest?subcat=%E5%85%A8%E9%83%A8&p={0}".format(str(page))
        html = get_html(url)
        parse_html(html)
    # insert once after the crawl; inserting inside the loop would re-insert
    # every earlier page's rows, since result accumulates across pages
    sql_insert = "INSERT INTO db(Naame,score,price,content,a,b) VALUES(%s,%s,%s,%s,%s,%s)"
    join_all(sql_insert, result, **parms)
    print(result)
```
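The INSERT assumes a table named db with six text columns, but the article never shows its schema. The following setup sketch reconstructs a plausible one from the INSERT statement; the column types are my guesses, not taken from the original:

```python
import pymysql

# Create the table the crawler inserts into, if it does not exist yet.
# The schema below is an assumption reconstructed from the INSERT statement.
ddl = """
CREATE TABLE IF NOT EXISTS db (
    Naame   VARCHAR(255),
    score   VARCHAR(32),
    price   VARCHAR(128),
    content VARCHAR(128),
    a       VARCHAR(128),
    b       VARCHAR(128)
)
"""
conn = pymysql.connect(host="127.0.0.1", port=3306, user="root",
                       passwd="123456", db="db", charset="utf8")
with conn.cursor() as cur:
    cur.execute(ddl)
conn.commit()
conn.close()
```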
Summary

That is all for this article. I hope it helps, and I hope you will keep following 服務(wù)器之家 for more content!
Original article: https://blog.csdn.net/qq_50951790/article/details/120643441