Python保存.mht格式网页

前言

自动化完成目标网页的保存:文件名为标题名,网页可离线打开,并保证信息不缺损。

实现方案

使用擅长爬取网页的python实现。

方式1:urllib保存html

urllib是一个处理网络资源请求的包,包含request、error、parse、robotparser四个模块。调用urllib完成版本1:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
import urllib.request 

def getHtml(url):
    """Fetch ``url`` and return the raw response body.

    The request carries a desktop-browser ``User-Agent`` header; the
    accompanying write-up adds it because the target site appears to apply
    basic anti-crawler checks.

    Args:
        url: Address of the page to download.

    Returns:
        The undecoded response body as ``bytes``.

    Raises:
        urllib.error.URLError: If the request cannot be completed.
    """
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36"}
    request = urllib.request.Request(url, headers=headers)
    # Read inside ``with`` so the connection is closed as soon as the body
    # has been consumed instead of whenever the object is garbage-collected.
    with urllib.request.urlopen(request) as response:
        return response.read()

def saveHtml(file_name, file_content):
    """Write ``file_content`` to ``<file_name>.html`` in binary mode.

    Every ``/`` in ``file_name`` is replaced by ``_`` before the ``.html``
    suffix is appended.

    Args:
        file_name: Target path without extension.
        file_content: Bytes to store.
    """
    target = "{}.html".format(file_name.replace('/', '_'))
    with open(target, "wb") as out:
        out.write(file_content)

base_url = 'http://www.weain.mil.cn/cggg/zbgg/'
base_path = 'E:\\jkl\\'

# Download every announcement page whose numeric id lies in the range and
# store it as <base_path><id>.html.
for p in range(615675, 615682):
    page_url = base_url + str(p) + '.html'
    try:
        result = getHtml(page_url)
        print(page_url)
        path = base_path + str(p)
        saveHtml(path, result)
    except Exception as err:
        # Best effort: a page that cannot be fetched or written is reported
        # and skipped.  ``Exception`` (rather than a bare ``except``) keeps
        # Ctrl+C and interpreter shutdown working.
        print("skip " + page_url + ": " + str(err))
        continue

print("done")

问题

1.1 反爬:该网站虽然简单,但web服务应该还是添加了基本的反爬措施。解决:需要在request时添加浏览器headers,将程序行为伪装成浏览器行为。

1.2 html信息不全:抓下来的html文件信息不全,图片丢失,网页布局混乱。该方式只适合快速抓取关键字的场景。

方式2:Chilkat保存MHTML(mht)

MHTML = MIME Encapsulation of Aggregate HTML Documents,即将一个带多个附件(图片、flash、Java小程序等)的网页存储为单一文档,也可用于发送html电子邮件。

Chilkat是一个功能强大的包,直接提供了保存mht格式文件的接口。调用Chilkat完成版本2:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
# !/usr/bin/python
# -*-coding:utf-8-*-

import sys
import chilkat
import urllib.request

from bs4 import BeautifulSoup

def get_title(url):
    """Return the text of the first ``<h1>`` element of the page at ``url``.

    The page is downloaded with a desktop-browser ``User-Agent`` header and
    parsed with BeautifulSoup's ``lxml`` parser.  The title is also printed.

    Args:
        url: Address of the page to inspect.

    Returns:
        The text content of the first ``<h1>`` element.

    Raises:
        ValueError: If the page contains no ``<h1>`` element.
        urllib.error.URLError: If the page cannot be downloaded.
    """
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36"}
    request = urllib.request.Request(url, headers=headers)
    # Close the connection as soon as the body has been read.
    with urllib.request.urlopen(request) as response:
        html = response.read()
    soup = BeautifulSoup(html, 'lxml')
    # Look the heading up once and fail with a clear message when it is
    # missing, instead of an AttributeError on None.
    heading = soup.find('h1')
    if heading is None:
        raise ValueError("no <h1> element found in " + url)
    title = heading.get_text()
    print(title)
    return title

# Create the Chilkat MHT object and unlock it with the trial string before
# any page is saved.
mht = chilkat.CkMht()

success = mht.UnlockComponent("Anything for 30-day trial")
if not success:
    print(mht.lastErrorText())
    # Non-zero status so the failure is visible to the calling shell.
    sys.exit(1)

base_url = "http://www.weain.mil.cn/cggg/zbgg/"
base_path = "C:\\Users\\x.x-pc\\Desktop\\html_grep\\save\\"

if __name__ == '__main__':
    # The page title becomes the file name, so characters that Windows does
    # not allow in file names are mapped to "_".
    invalid_name_chars = str.maketrans({ch: "_" for ch in '\\/:*?"<>|'})

    # Save every announcement page in the id range as <title>.mht.
    for p in range(615675, 615676):
        target_url = base_url + str(p) + '.html'
        file_title = get_title(target_url).strip().translate(invalid_name_chars)
        save_path = base_path + file_title + '.mht'
        print(target_url)
        print(save_path)

        success = mht.GetAndSaveMHT(target_url, save_path)
        if not success:
            print(mht.lastErrorText())
            # Non-zero status so the failure is visible to the calling shell.
            sys.exit(1)

    print("done")

问题:

2.1 该方式可以较好地解决问题1.2,但依然有部分图片信息缺损,与chrome浏览器直接右键另存为mht格式的文件相比,信息差距较大。

2.2 该网页采用动态加载的方式,发布的采购公告信息由JavaScript加载。Chilkat只能抓取到当前页面,无法抓到动态加载的页面内容。接下来就是想办法抓到动态页面,然后拼接到静态页面 emmm ……

方式3:Selenium

Selenium是一个用于Web应用程序测试的工具。Selenium测试直接运行在浏览器中,就像真正的用户在操作一样。支持的浏览器包括IE(7, 8, 9, 10, 11)、Mozilla Firefox、Safari、Google Chrome、Opera等。python也有对应的selenium包:selenium直接与chromedriver进行交互,调用chromedriver原生api实现浏览器的操作,可以利用浏览器保存完整的mht文件。

问题:保存网页时默认文件名取的是html的title字段,而每个网页的该字段都一样;真正的标题在h1字段中,因此需要修改windows保存窗口的文件名栏。而操作windows窗口超出了selenium的能力范畴。

为了解决该问题,先后搜查了如下方式: AutoHotKey、AutoIt 其中AutoHotKey是一款免费的、Windows平台下开放源代码的热键脚本语言 可以通过脚本调用键鼠、系统接口及程序,并创建基于简单语言的图形化界面的执行程序

该工具非常强大,可以自动化完成高重复性操作,可适用于自动化测试、生产中

AutoHotKey可将该语言的脚本编译成exe,供python调用;AutoIt则有对应的pyautoit包,其最终实现也是调用AutoItX3.dll。

pyautogui包可以操作键鼠,不需要调用其他动态库,且使用相对简单,是该问题最后选取的方案。为了改善人机交互,使用tkinter实现了简单的对话框界面。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
# -*- coding: utf-8 -*-

import os
import time
import math

import tkinter.messagebox
from tkinter import *
from tkinter.filedialog import askdirectory

import pyautogui
import pyperclip
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service


def init_driver(path, quiet=True):
    """Start a chromedriver service and open a Chrome session.

    Chrome is always launched with ``--save-page-as-mhtml``,
    ``--start-maximized``, ``--disable-infobars``,
    ``--ignore-certificate-errors`` and ``--ignore-ssl-errors``.

    Args:
        path: Location of the chromedriver executable.
        quiet: When true, ``--headless``, ``--disable-gpu`` and
            ``--hide-scrollbars`` are added as well.

    Returns:
        A ``(service, driver)`` tuple; both are expected by ``exit_driver``.
    """
    # NOTE(review): the driver below is built from ``executable_path`` and is
    # never handed this service object, so it presumably launches its own
    # chromedriver process -- confirm whether the explicit service is needed.
    service = Service(executable_path=path)
    service.start()

    arguments = [
        "--save-page-as-mhtml",
        "--start-maximized",
        '--disable-infobars',
        '--ignore-certificate-errors',
        '--ignore-ssl-errors',
    ]
    if quiet:
        # Silent mode: no visible browser window.
        arguments += ['--headless', '--disable-gpu', "--hide-scrollbars"]

    custom_options = webdriver.ChromeOptions()
    for argument in arguments:
        custom_options.add_argument(argument)

    driver = webdriver.Chrome(executable_path=path, options=custom_options)
    return service, driver


def exit_driver(service, driver):
    """Shut down the browser session and then the chromedriver service.

    Args:
        service: Object with a ``stop()`` method (as returned by
            ``init_driver``).
        driver: Object with a ``quit()`` method (as returned by
            ``init_driver``).

    Raises:
        Whatever ``driver.quit()`` raises; the service is stopped first.
    """
    try:
        driver.quit()
    finally:
        # Stop the service even when quitting the browser fails, so no
        # chromedriver process is left behind.
        service.stop()


def save_page_as(name):
    """Drive the browser's save dialog with the keyboard to save as ``name``.

    Sends Ctrl+S, pastes ``name`` from the clipboard, then presses Enter,
    pausing one second after each key press.

    Args:
        name: Text pasted into the save dialog (path and file name).
    """
    def press(*keys):
        # Send one key combination, then give the UI a second to react.
        pyautogui.hotkey(*keys)
        time.sleep(1)

    # Open the save dialog.
    press('ctrl', 's')
    # pyautogui cannot type Chinese text, so the name goes through the
    # clipboard instead.
    pyperclip.copy(name)
    press('ctrl', 'v')
    press('enter')
    # Overwriting an already existing file is not supported yet.


def save_mht(driver_path, tgt_url_list, file_path_list):
    """Open every target URL in a visible Chrome window and save it via the save dialog.

    For each URL the page is loaded, the text of its ``<h1>`` element is read
    and printed, and ``save_page_as`` is called with
    ``<folder>\\<title>``.  A new tab is then opened with Ctrl+T for the next
    page; after every five pages the five oldest windows are closed.

    Args:
        driver_path: Location of the chromedriver executable.
        tgt_url_list: One list of page URLs per category.
        file_path_list: One target folder per category, parallel to
            ``tgt_url_list``.
    """
    # The title becomes the file name typed into the save dialog, so
    # characters Windows does not allow in file names are mapped to "_".
    invalid_name_chars = str.maketrans({ch: "_" for ch in '\\/:*?"<>|'})
    allowed_chrome_tab_cnt = 5
    opened_chrome_tab_cnt = 0

    service, driver = init_driver(driver_path, quiet=False)
    try:
        for url_list, file_path in zip(tgt_url_list, file_path_list):
            print(file_path)
            for url in url_list:
                driver.get(url)
                opened_chrome_tab_cnt += 1
                # ``find_element(by, value)`` is available in both old and
                # current Selenium releases, unlike ``find_element_by_*``.
                title = driver.find_element("tag name", "h1").text
                print(title)
                save_page_as(file_path + "\\" + title.strip().translate(invalid_name_chars))

                # Open a new tab and make it the active window.
                pyautogui.hotkey('ctrl', 't')
                driver.switch_to.window(driver.window_handles[-1])
                # Temporary workaround: Chrome's search engine cannot be
                # changed, which makes the new-tab page block the connection,
                # so a URL is pasted into the new tab right away.
                pyperclip.copy(url)
                pyautogui.hotkey('ctrl', 'v')
                pyautogui.hotkey('enter')

                # Close the tabs opened so far so that Chrome does not
                # exhaust memory with too many of them.
                if opened_chrome_tab_cnt == allowed_chrome_tab_cnt:
                    handle = driver.current_window_handle
                    while opened_chrome_tab_cnt != 0:
                        driver.switch_to.window(driver.window_handles[0])
                        driver.close()
                        opened_chrome_tab_cnt -= 1
                        time.sleep(2)
                    driver.switch_to.window(handle)

        # NOTE(review): presumably gives the last save time to finish before
        # the browser is closed -- confirm.
        time.sleep(6)
    finally:
        # Always release the browser and the service, also after an error.
        exit_driver(service, driver)


def get_tgt_url(driver_path, start_index_list, start_item_list, item_total_cnt_list):
    """Collect the page URLs to save for each of the four announcement categories.

    For every category whose item count is not the empty string, the index
    pages are loaded in a quiet browser starting at the requested index page.
    Links whose ``href`` starts with the category path and which carry
    ``target`` and ``title`` attributes are taken in document order; the
    first ``start_item - 1`` of them are skipped and the following ones are
    collected until the requested number is reached.

    Args:
        driver_path: Location of the chromedriver executable.
        start_index_list: Per category, the first index page number (text).
        start_item_list: Per category, the 1-based position of the first
            item to collect (text).
        item_total_cnt_list: Per category, how many items to collect (text);
            ``""`` skips the category.

    Returns:
        A list with one list of absolute URLs per category.

    Raises:
        ValueError: If a needed entry is not an integer.
    """
    # ``re`` is not imported explicitly anywhere in this script, so bring it
    # into scope here rather than relying on a star import to provide it.
    import re

    item_per_page_cnt = 8
    home_page_url = "http://www.weain.mil.cn"
    index_page_id_list = ["/cggg/zbgg/", "/cggg/zbgg1/", "/cggg/gzgg/", "/cggg/qtgg/"]
    tgt_url_list = [[] for _ in index_page_id_list]

    service, driver = init_driver(driver_path)
    try:
        for i, index_page_id in enumerate(index_page_id_list):
            if item_total_cnt_list[i] == "":
                continue
            item_total_cnt = int(item_total_cnt_list[i])
            if item_total_cnt <= 0:
                # Nothing requested for this category.
                continue
            start_index = int(start_index_list[i])
            start_item = int(start_item_list[i])

            # Number of index pages needed, rounded up.
            index_page_cnt = math.ceil((start_item + item_total_cnt - 1) / item_per_page_cnt)
            href_pattern = re.compile("^" + index_page_id)
            collected = tgt_url_list[i]

            href_cnt = 1
            for page_index in range(index_page_cnt):
                # Address of the index page: page 1 has no numeric suffix.
                page_no = start_index + page_index
                if page_no == 1:
                    index_page_url = home_page_url + index_page_id + "index.html"
                else:
                    index_page_url = home_page_url + index_page_id + \
                        "index_" + str(page_no) + ".html"

                print(index_page_url)
                # Extract the target URLs from the index page.
                driver.get(index_page_url)
                soup = BeautifulSoup(driver.page_source, "html.parser")
                for link in soup.find_all(href=href_pattern, target=True, title=True):
                    if href_cnt != start_item:
                        # Still before the first requested item.
                        href_cnt += 1
                        continue
                    tgt_url = home_page_url + link['href']
                    print(tgt_url)
                    collected.append(tgt_url)
                    if len(collected) >= item_total_cnt:
                        break

                # Stop paging as well once enough items were collected;
                # otherwise a later page would push the list past the limit.
                if len(collected) >= item_total_cnt:
                    break
    finally:
        # Always release the browser and the service, also after an error.
        exit_driver(service, driver)

    return tgt_url_list


def center_window(root, width, height):
    """Give ``root`` the size ``width`` x ``height`` and centre it on the screen.

    Args:
        root: Window object offering ``winfo_screenwidth()``,
            ``winfo_screenheight()`` and ``geometry()``.
        width: Window width in pixels.
        height: Window height in pixels.
    """
    # Offsets that leave equal margins left/right and top/bottom.
    offset_x = (root.winfo_screenwidth() - width) / 2
    offset_y = (root.winfo_screenheight() - height) / 2
    # ``%d`` drops the fractional part of the offsets.
    root.geometry('%dx%d+%d+%d' % (width, height, offset_x, offset_y))


def main_process(cb_name_list, start_index_list, start_item_list, item_total_cnt_list, file_save_path):
    """Create the output folders, collect the target URLs and save them as MHT.

    A time-stamped folder is created below ``file_save_path``, with one
    sub-folder per category whose item count is not the empty string.

    Args:
        cb_name_list: Category names, used as sub-folder names.
        start_index_list: Per category, the first index page number (text).
        start_item_list: Per category, the first item position (text).
        item_total_cnt_list: Per category, the number of items to save
            (text); ``""`` means the category is not saved.
        file_save_path: Directory below which the time-stamped folder is
            created.

    Raises:
        SystemExit: If ``chromedriver.exe`` is not found next to this file.
    """
    # ``sys`` is not imported explicitly by this script, so import it here.
    import sys

    # Directory of this file; to keep deployment simple chromedriver is
    # expected in the same directory.
    pwd = os.path.split(os.path.realpath(__file__))[0]
    chrome_driver_path = pwd + "\\chromedriver.exe"
    if not os.path.exists(chrome_driver_path):
        tkinter.messagebox.showerror("错误", "chromedriver必须与exe同一路径")
        # ``sys.exit`` instead of the ``site``-provided ``exit`` helper, which
        # is not guaranteed to exist; non-zero because this is an error.
        sys.exit(1)

    # A folder named after the current time avoids overwriting earlier runs.
    localtime = time.strftime('%Y-%m-%d_%H-%M-%S', time.localtime(time.time()))
    file_save_path = file_save_path + "\\" + localtime
    os.mkdir(file_save_path)

    file_save_path_list = []
    for cb_name, item_total_cnt in zip(cb_name_list, item_total_cnt_list):
        category_path = file_save_path + "\\" + cb_name
        file_save_path_list.append(category_path)
        # Only categories that are actually saved get a folder.
        if item_total_cnt != "":
            os.mkdir(category_path)

    # Collect the target URLs with a quiet browser.
    target_url_list = get_tgt_url(chrome_driver_path, start_index_list, start_item_list, item_total_cnt_list)

    # Save the target pages.
    save_mht(chrome_driver_path, target_url_list, file_save_path_list)


def main():
    """Show the "保存MHT" dialog, validate the entries and start the save run.

    The window has one row per announcement category (tick box, start index
    page, start item, number of items to save) and a target-directory field.
    Pressing "确定" validates the entries; when none is rejected the window
    is hidden, ``main_process`` is run, a completion box is shown and the
    program exits.
    """
    # ``sys`` is not imported explicitly by this script, so import it here.
    import sys

    root = Tk()
    root.title("保存MHT")
    root.resizable(width=False, height=False)
    center_window(root, 550, 400)

    def select_path():
        # Let the user pick a directory and show it with backslashes.
        path_ = askdirectory()
        save_path_text.set(path_.replace("/", "\\"))

    def check():
        # Unticking a category clears its item-count entry.
        for i in range(4):
            if cb_var_list[i].get() == '0':
                entry_item_cnt_list[i].delete(0, END)

    def to_int(text):
        # Return ``text`` as an int, or None when it is not an integer.
        try:
            return int(text)
        except ValueError:
            return None

    entry_start_index_list = []
    entry_start_item_list = []
    entry_item_cnt_list = []
    cb_name_list = ["招标公告", "中标公告", "更正公告", "其他公告"]
    cb_var_list = [StringVar() for n in range(4)]
    start_index_text_list = [StringVar() for n in range(4)]
    start_item_text_list = [StringVar() for n in range(4)]
    item_cnt_text_list = [StringVar() for n in range(4)]

    # One frame per category, stacked vertically.
    for i in range(4):
        frame = Frame(root)
        cb_var_list[i].set(0)
        Checkbutton(frame, text=cb_name_list[i], variable=cb_var_list[i], command=check).grid(row=0)

        Label(frame, text="起始索引页:").grid(row=1, column=0)
        start_index_text_list[i].set("1")
        entry_start_index_list.append(Entry(frame, textvariable=start_index_text_list[i], width=10))
        entry_start_index_list[i].grid(row=1, column=1)

        Label(frame, text="起始条目:").grid(row=1, column=2, padx=20)
        start_item_text_list[i].set("1")
        entry_start_item_list.append(Entry(frame, textvariable=start_item_text_list[i], width=10))
        entry_start_item_list[i].grid(row=1, column=3)

        Label(frame, text="保存条目数:").grid(row=1, column=4, padx=20)
        entry_item_cnt_list.append(Entry(frame, textvariable=item_cnt_text_list[i], width=10))
        entry_item_cnt_list[i].grid(row=1, column=5)

        frame.place(x=0, y=i*70)

    save_path_text = StringVar()

    frame = Frame(root)
    Label(frame, text=" 保存路径: ").grid(row=0)
    Entry(frame, textvariable=save_path_text, width=60).grid(row=0, column=1)
    Button(frame, text="...", width=1, command=select_path).grid(row=0, column=2)

    def get_content():
        # Handler of the confirm button: read, validate, then run.
        start_index_list = [var.get() for var in start_index_text_list]
        start_item_list = [var.get() for var in start_item_text_list]
        item_total_cnt_list = [var.get() for var in item_cnt_text_list]

        # Validate the entries.
        warning_flag = False
        for i in range(4):
            start_item = to_int(start_item_list[i])
            if start_item is None or start_item < 1:
                # Reported here instead of raising ValueError in the callback.
                warning_flag = True
                tkinter.messagebox.showwarning("警告", "[" + cb_name_list[i] + "] 起始条目必须为正整数")
            elif start_item > 8:
                warning_flag = True
                tkinter.messagebox.showwarning("警告", "起始条目不能超过8")

        for i in range(4):
            if cb_var_list[i].get() == '1' and item_total_cnt_list[i] == '':
                warning_flag = True
                tkinter.messagebox.showwarning("警告", "请填入 [" + cb_name_list[i] + "] 保存条目数")
            if cb_var_list[i].get() == '0' and item_total_cnt_list[i] != '':
                warning_flag = True
                tkinter.messagebox.showwarning("警告", "请勾选 [" + cb_name_list[i] + "] ")

        # The numbers of every category that will be saved are converted
        # with int() later on; reject bad values before any folder is
        # created.
        for i in range(4):
            if item_total_cnt_list[i] == '':
                continue
            start_index = to_int(start_index_list[i])
            item_total_cnt = to_int(item_total_cnt_list[i])
            if start_index is None or start_index < 1 or item_total_cnt is None or item_total_cnt < 1:
                warning_flag = True
                tkinter.messagebox.showwarning("警告", "[" + cb_name_list[i] + "] 起始索引页和保存条目数必须为正整数")

        check_nothing_cnt = 0
        for i in range(4):
            if cb_var_list[i].get() == '0' and item_total_cnt_list[i] == '':
                check_nothing_cnt += 1
        if check_nothing_cnt == 4:
            warning_flag = True
            tkinter.messagebox.showwarning("警告", "请填入‘保存条目数’")

        file_save_path = save_path_text.get()
        if not file_save_path:
            warning_flag = True
            tkinter.messagebox.showwarning("警告", "请选择‘保存路径’")

        if not warning_flag:
            # Hide the main window (also the parent of later message boxes).
            root.withdraw()
            main_process(cb_name_list, start_index_list, start_item_list, item_total_cnt_list, file_save_path)
            tkinter.messagebox.showinfo("保存MHT", "完成")
            root.destroy()
            # ``sys.exit`` instead of the ``site``-provided ``exit`` helper,
            # which is not guaranteed to exist.
            sys.exit(0)

    Button(frame, text="确定", width=8, command=get_content).grid(row=1, column=1, pady=20)
    frame.place(x=0, y=300)

    root.mainloop()


if __name__ == '__main__':
    main()

-------------The End-------------
🙈坚持原创技术分享,感谢支持🙈
0%