In a previous article we shared a PHP implementation for batch-fetching remote web images and saving them locally; interested readers can refer to that post for details. This article does the same thing in Python.
# -*- coding: utf-8 -*-
import os
import uuid
import urllib2
import cookielib

'''Get the file extension'''
def get_file_extension(file):
    return os.path.splitext(file)[1]

'''Create the directory if it does not exist and return it'''
def mkdir(path):
    # strip leading/trailing whitespace
    path = path.strip()
    # strip a trailing backslash
    path = path.rstrip("\\")
    if not os.path.exists(path):
        os.makedirs(path)
    return path

'''Generate a unique string with a fixed length of 36 characters'''
def unique_str():
    return str(uuid.uuid1())

'''
Fetch the content of a remote file into memory
@url  the file to fetch (path + filename)
'''
def get_file(url):
    try:
        cj = cookielib.LWPCookieJar()
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
        urllib2.install_opener(opener)
        req = urllib2.Request(url)
        operate = opener.open(req)
        data = operate.read()
        return data
    except BaseException, e:
        print e
        return None

'''
Save data to a local file
@path       local directory
@file_name  file name
@data       file content
'''
def save_file(path, file_name, data):
    if data == None:
        return
    mkdir(path)
    if not path.endswith("/"):
        path = path + "/"
    file = open(path + file_name, "wb")
    file.write(data)
    file.flush()
    file.close()

# get the file extension
print get_file_extension("123.jpg")
# create the directory and return it
#print mkdir("d:/ljq")
# generate a unique 36-character string
print unique_str()
url = "http://qlogo1.store.qq.com/qzone/416501600/416501600/100?0"
save_file("d:/ljq/", "123.jpg", get_file(url))
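The script above targets Python 2 (urllib2, cookielib and print statements). For readers on Python 3, here is a minimal sketch of the same fetch-and-save idea using the renamed standard-library modules; the save path and the reuse of the sample QQ avatar URL are illustrative assumptions, not part of the original script.

# -*- coding: utf-8 -*-
# Minimal Python 3 sketch: fetch a remote file with a cookie-aware opener
# and save the raw bytes to a local path. URL and save path are placeholders.
import os
import urllib.request
import http.cookiejar

def get_file(url):
    """Fetch the raw bytes at url, returning None on failure."""
    try:
        cj = http.cookiejar.LWPCookieJar()
        opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
        with opener.open(url) as resp:
            return resp.read()
    except Exception as e:
        print(e)
        return None

def save_file(path, file_name, data):
    """Write data to path/file_name, creating the directory if needed."""
    if data is None:
        return
    os.makedirs(path, exist_ok=True)
    with open(os.path.join(path, file_name), "wb") as f:
        f.write(data)

save_file("./tmp/", "123.jpg",
          get_file("http://qlogo1.store.qq.com/qzone/416501600/416501600/100?0"))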
Fetching the images on a given URL with Python and saving them locally
# *** encoding: utf-8 ***
__author__ = 'jiangyt'
"""
fetch images from specific url
v1.0
"""
import urllib, httplib, urlparse
import re
import random

"""judge whether the url exists or not"""
def httpExists(url):
    host, path = urlparse.urlsplit(url)[1:3]
    if ':' in host:
        # port specified, try to use it
        host, port = host.split(':', 1)
        try:
            port = int(port)
        except ValueError:
            print 'invalid port number %r' % (port,)
            return False
    else:
        # no port specified, use the default port
        port = None
    try:
        connection = httplib.HTTPConnection(host, port=port)
        connection.request("HEAD", path)
        resp = connection.getresponse()
        if resp.status == 200:
            # normal 'found' status
            found = True
        elif resp.status == 302:
            # recurse on temporary redirect
            found = httpExists(urlparse.urljoin(url, resp.getheader('location', '')))
        else:
            # everything else -> not found
            print "Status %d %s : %s" % (resp.status, resp.reason, url)
            found = False
    except Exception, e:
        print e.__class__, e, url
        found = False
    return found

"""get the html source, return a list of lines"""
def gGetHtmlLines(url):
    if url == None:
        return
    if not httpExists(url):
        return
    try:
        page = urllib.urlopen(url)
        html = page.readlines()
        page.close()
        return html
    except Exception, e:
        print "gGetHtmlLines() error! Exception ==>> " + str(e)
        return

"""get the html source, return a string"""
def gGetHtml(url):
    if url == None:
        return
    if not httpExists(url):
        return
    try:
        page = urllib.urlopen(url)
        html = page.read()
        page.close()
        return html
    except Exception, e:
        print "gGetHtml() error! Exception ==>> " + str(e)
        return

"""get the file name from a url"""
def gGetFileName(url):
    if url == None:
        return None
    if url == "":
        return ""
    arr = url.split("/")
    return arr[len(arr) - 1]

"""generate a random file name"""
def gRandFilename(type):
    fname = ''
    for i in range(16):
        fname = fname + chr(random.randint(65, 90))
        fname = fname + chr(random.randint(48, 57))
    return fname + '.' + type

"""given the page url and a link found on it, compute the link's absolute address"""
def gGetAbslLink(url, link):
    if url == None or link == None:
        return
    if url == '' or link == '':
        return url
    addr = ''
    if link[0] == '/':
        addr = gGetHttpAddr(url) + link
    elif len(link) > 3 and link[0:4] == 'http':
        addr = link
    elif len(link) > 2 and link[0:2] == '..':
        addr = gGetHttpAddrFatherAssign(url, link)
    else:
        addr = gGetHttpAddrFather(url) + link
    return addr

"""match a regular expression against the given lines, return a list of captures"""
def gGetRegList(linesList, regx):
    if linesList == None:
        return
    rtnList = []
    for line in linesList:
        matchs = re.search(regx, line, re.IGNORECASE)
        if matchs != None:
            allGroups = matchs.groups()
            for foundStr in allGroups:
                if foundStr not in rtnList:
                    rtnList.append(foundStr)
    return rtnList

"""download the file at url, saving it under the given file name"""
def gDownloadWithFilename(url, savePath, file):
    # parameter checks omitted for now
    try:
        urlopen = urllib.URLopener()
        fp = urlopen.open(url)
        data = fp.read()
        fp.close()
        file = open(savePath + file, 'w+b')
        file.write(data)
        file.close()
    except IOError, error:
        print "DOWNLOAD %s ERROR!==>>%s" % (url, error)
    except Exception, e:
        print "Exception==>> " + str(e)

"""download the file at url, deriving the file name from the url"""
def gDownload(url, savePath):
    # parameter checks omitted for now
    fileName = gGetFileName(url)
    #fileName = gRandFilename('jpg')
    gDownloadWithFilename(url, savePath, fileName)

"""download the jpg images on the page at downloadUrl"""
def gDownloadHtmlJpg(downloadUrl, savePath):
    lines = gGetHtmlLines(downloadUrl)  # get the page source
    regx = r"""src\s*="?(\S+)\.jpg"""
    lists = gGetRegList(lines, regx)  # get the links that match the regular expression
    if lists == None:
        return
    for jpg in lists:
        jpg = gGetAbslLink(downloadUrl, jpg) + '.jpg'
        gDownload(jpg, savePath)
        print gGetFileName(jpg)

"""get the site root address from a url"""
def gGetHttpAddr(url):
    if url == '':
        return ''
    arr = url.split("/")
    return arr[0] + "//" + arr[2]

"""get the parent directory from a url"""
def gGetHttpAddrFather(url):
    if url == '':
        return ''
    arr = url.split("/")
    addr = arr[0] + '//' + arr[2] + '/'
    if len(arr) - 1 > 3:
        for i in range(3, len(arr) - 1):
            addr = addr + arr[i] + '/'
    return addr

"""given a url and a relative link containing '..', compute the link's absolute address"""
def gGetHttpAddrFatherAssign(url, link):
    if url == '':
        return ''
    if link == '':
        return ''
    linkArray = link.split("/")
    urlArray = url.split("/")
    partLink = ''
    partUrl = ''
    numOfFather = 0
    for i in range(len(linkArray)):
        if linkArray[i] == '..':
            numOfFather = i + 1  # number of parent levels
        else:
            partLink = partLink + '/' + linkArray[i]
    for i in range(len(urlArray) - 1 - numOfFather):
        partUrl = partUrl + urlArray[i]
        if i < len(urlArray) - 1 - numOfFather - 1:
            partUrl = partUrl + '/'
    return partUrl + partLink

"""collect the htm/html links on the page at url, return a list"""
def gGetHtmlLink(url):
    # parameter checks omitted for now
    rtnList = []
    lines = gGetHtmlLines(url)
    regx = r"""href="?(\S+)\.htm"""
    for link in gGetRegList(lines, regx):
        link = gGetAbslLink(url, link) + '.htm'
        if link not in rtnList:
            rtnList.append(link)
            print link
    return rtnList

"""download the jpgs on the page at url and on the htm pages it links to"""
def gDownloadAllJpg(url, savePath):
    # parameter checks omitted for now
    gDownloadHtmlJpg(url, savePath)
    # download the jpgs on the linked pages
    links = gGetHtmlLink(url)
    for link in links:
        gDownloadHtmlJpg(link, savePath)

"""test"""
def main():
    u = 'http://site.douban.com/196738/room/2462453/'  # the page whose images we want to fetch
    save = '/root/python/tmp/'  # the directory in which to save the images
    print 'download pic from [' + u + ']'
    print 'save to [' + save + '] ...'
    gDownloadHtmlJpg(u, save)
    print "download finished"

if __name__ == "__main__":
    main()
else:
    print "called from intern."
That is all the code for fetching images from a web page with Python and saving them locally. We hope you find it useful.