青青av,亚洲精品乱码久久久久久金桔影视,欧美性网

本文實例為大家分享了python 爬取網頁內容轉換為PDF的具體代碼，供大家參考，具體內容如下

將廖雪峰的學習教程轉換成PDF文件，代碼只適合該網站，如果需要其他網站的教程，可靠需要進行稍微的修改。

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

									# coding=utf-8 

									import os 

									import re 

									import time 

									import pdfkit 

									import requests 

									from bs4 import BeautifulSoup 

									from PyPDF2 import PdfFileMerger

									import sys

									reload(sys)

									sys.setdefaultencoding('utf8')

									html_template = """ 

									<!DOCTYPE html> 

									<html lang="en"> 

									<head> 

									 <meta charset="UTF-8"> 

									</head> 

									<body> 

									{content} 

									</body> 

									</html> 

									"""

									#----------------------------------------------------------------------

									def parse_url_to_html(url, name): 

									 """ 

									 解析URL，返回HTML內容 

									 :param url:解析的url 

									 :param name: 保存的html文件名 

									 :return: html 

									 """

									 try: 

									  response = requests.get(url) 

									  soup = BeautifulSoup(response.content, 'html.parser') 

									  # 正文 

									  body = soup.find_all(class_="x-wiki-content")[0] 

									  # 標題 

									  title = soup.find('h4').get_text() 

									  # 標題加入到正文的最前面，居中顯示 

									  center_tag = soup.new_tag("center") 

									  title_tag = soup.new_tag('h1') 

									  title_tag.string = title 

									  center_tag.insert(1, title_tag) 

									  body.insert(1, center_tag) 

									  html = str(body) 

									  # body中的img標簽的src相對路徑的改成絕對路徑 

									  pattern = "(<img .*?src=\")(.*?)(\")"

									  def func(m): 

									   if not m.group(3).startswith("http"): 

									    rtn = m.group(1) + "http://www.liaoxuefeng.com" + m.group(2) + m.group(3) 

									    return rtn 

									   else: 

									    return m.group(1)+m.group(2)+m.group(3) 

									  html = re.compile(pattern).sub(func, html) 

									  html = html_template.format(content=html) 

									  html = html.encode("utf-8") 

									  with open(name, 'wb') as f: 

									   f.write(html) 

									  return name 

									 except Exception as e:

									  print "解析錯誤！"

									#----------------------------------------------------------------------

									def get_url_list(): 

									 """ 

									 獲取所有URL目錄列表 

									 :return: 

									 """

									 response = requests.get("http://www.liaoxuefeng.com/wiki/0014316089557264a6b348958f449949df42a6d3a2e542c000") 

									 soup = BeautifulSoup(response.content, "html.parser") 

									 menu_tag = soup.find_all(class_="uk-nav uk-nav-side")[1] 

									 urls = [] 

									 for li in menu_tag.find_all("li"): 

									  url = "http://www.liaoxuefeng.com" + li.a.get('href') 

									  urls.append(url) 

									 return urls 

									#----------------------------------------------------------------------

									def save_pdf(htmls, file_name): 

									 """ 

									 把所有html文件保存到pdf文件 

									 :param htmls: html文件列表 

									 :param file_name: pdf文件名 

									 :return: 

									 """

									 options = { 

									  'page-size': 'Letter', 

									  'margin-top': '0.75in', 

									  'margin-right': '0.75in', 

									  'margin-bottom': '0.75in', 

									  'margin-left': '0.75in', 

									  'encoding': "UTF-8", 

									  'custom-header': [ 

									   ('Accept-Encoding', 'gzip') 

									  ], 

									  'cookie': [ 

									   ('cookie-name1', 'cookie-value1'), 

									   ('cookie-name2', 'cookie-value2'), 

									  ], 

									  'outline-depth': 10, 

									 } 

									 pdfkit.from_file(htmls, file_name, options=options) 

									#----------------------------------------------------------------------

									def main(): 

									 start = time.time() 

									 file_name = u"liaoxuefeng_Python3_tutorial"

									 urls = get_url_list() 

									 for index, url in enumerate(urls):

									  parse_url_to_html(url, str(index) + ".html") 

									 htmls =[] 

									 pdfs =[] 

									 for i in range(0,124): 

									  htmls.append(str(i)+'.html') 

									  pdfs.append(file_name+str(i)+'.pdf') 

									  save_pdf(str(i)+'.html', file_name+str(i)+'.pdf') 

									  print u"轉換完成第"+str(i)+'個html'

									 merger = PdfFileMerger() 

									 for pdf in pdfs:

									  merger.append(open(pdf,'rb'))

									  print u"合并完成第"+str(i)+'個pdf'+pdf 

									 output = open(u"廖雪峰Python_all.pdf", "wb") 

									 merger.write(output) 

									 print u"輸出PDF成功！"

									 for html in htmls: 

									  os.remove(html) 

									  print u"刪除臨時文件"+html 

									 for pdf in pdfs: 

									  os.remove(pdf) 

									  print u"刪除臨時文件"+pdf 

									 total_time = time.time() - start 

									 print(u"總共耗時：%f 秒" % total_time)

									#----------------------------------------------------------------------

									def changeDir(dir_name):

									 """

									 目錄切換

									 """

									 if not os.path.exists(dir_name):

									  os.mkdir(dir_name)

									 os.chdir(dir_name)

									#----------------------------------------------------------------------

									if __name__ == '__main__':

									 #存放文件的路徑

									 dir_name = '/home/Python/Html'

									 changeDir(dir_name)

									 main()