前言

自动化完成目标网页的保存：文件名为标题名，网页可离线打开，并保证信息不缺损。

# 实现方案

使用善于爬取网页的python实现。

方式1：urllib保存html。urllib是一个包含request、error、parse、robotparser四个模块、负责网络资源请求的包。调用urllib完成版本1：

import urllib.request 
  
def getHtml(url):
    """Download *url* and return the raw response body as ``bytes``.

    The request carries a desktop-browser ``User-Agent`` header.  The
    response object is used as a context manager so that it is closed as
    soon as the body has been read, instead of being left open.

    Errors raised by ``urllib.request.urlopen`` are not caught here.
    """
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36"}
    request = urllib.request.Request(url, headers=headers)
    with urllib.request.urlopen(request) as response:
        return response.read()

def saveHtml(file_name, file_content):
    """Write *file_content* (bytes) to ``<file_name>.html``.

    Every ``/`` in *file_name* is replaced by ``_`` before the ``.html``
    suffix is appended; an existing file of that name is overwritten.
    """
    target = "{}.html".format(file_name.replace('/', '_'))
    with open(target, "wb") as out:
        out.write(file_content)
		
base_url = 'http://www.weain.mil.cn/cggg/zbgg/'
base_path = 'E:\\jkl\\'

# Download a fixed range of announcement pages and store each one as
# <base_path><page id>.html.
for p in range(615675, 615682):
    target_url = base_url + str(p) + '.html'
    try:
        result = getHtml(target_url)
        print(target_url)
        saveHtml(base_path + str(p), result)
    except Exception as err:
        # Best effort: one page that cannot be fetched or written must not
        # stop the remaining pages, but the failure is reported instead of
        # being dropped silently.  KeyboardInterrupt/SystemExit are not
        # Exception subclasses, so they still stop the loop.
        print("failed: " + target_url + " (" + str(err) + ")")

print("done")

问题 1.1 反爬：该网站虽然简单，但web服务应该还是添加了基本的反爬措施。解决：需要在request时添加浏览器headers，将程序行为伪装为浏览器行为。

1.2 html信息不全：抓下来的html文件信息不全，图片丢失，网页框架混乱。该方式只适合快速抓取关键字的场景。

方式2：Chilkat保存MHTML(mht)。MHTML = MIME Encapsulation of Aggregate HTML Documents，即将一个多附件网页(图片、flash、Java小程序)存储为单一文档，可用于发送html电子邮件。

Chilkat是一个功能强大、直接提供保存mht格式文件接口的包。调用Chilkat完成版本2：

# !/usr/bin/python
# -*-coding:utf-8-*-

import sys
import chilkat
import urllib.request

from bs4 import BeautifulSoup

def get_title(url):
    """Return the text of the first ``<h1>`` element of the page at *url*.

    The page is downloaded with a desktop-browser ``User-Agent`` header and
    parsed with BeautifulSoup's ``lxml`` parser.  The title is also printed.

    Raises:
        ValueError: if the page contains no ``<h1>`` element.
    """
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36"}
    request = urllib.request.Request(url, headers=headers)
    # Close the response as soon as the body has been read.
    with urllib.request.urlopen(request) as response:
        html = response.read()

    heading = BeautifulSoup(html, 'lxml').find('h1')
    if heading is None:
        raise ValueError("no <h1> element found in " + url)

    title = heading.get_text()
    print(title)
    return title

mht = chilkat.CkMht()

# The component is unlocked before any other call is made on it; the string
# passed here is the one used for the trial period.
success = mht.UnlockComponent("Anything for 30-day trial")
if not success:
    print(mht.lastErrorText())
    # Non-zero status so a failed unlock is not reported as success.
    sys.exit(1)

base_url = "http://www.weain.mil.cn/cggg/zbgg/"
base_path = "C:\\Users\\x.x-pc\\Desktop\\html_grep\\save\\"

if __name__ == '__main__':
    # Save each page in the range as <base_path><h1 title>.mht.
    for p in range(615675, 615676):
        target_url = base_url + str(p) + '.html'
        save_path = base_path + get_title(target_url) + '.mht'
        print(target_url)
        print(save_path)

        success = mht.GetAndSaveMHT(target_url, save_path)
        if not success:
            print(mht.lastErrorText())
            sys.exit(1)

    print("done")

问题： 2.1 该方式可以较好地解决问题1.2，但是依然有部分图片信息缺损，与chrome浏览器直接右键保存为mht格式的文件相比，信息差距较大。

2.2 该网页采用的是动态加载的方式，发布的采购公告信息使用Javascript加载。Chilkat只能抓取到当前页面，无法抓到动态页面。接下来就是想办法抓到动态页面，然后拼接到静态页面 emmm ……

方式3：Selenium。Selenium是一个用于Web应用程序测试的工具。Selenium测试直接运行在浏览器中，就像真正的用户在操作一样。支持的浏览器包括IE（7, 8, 9, 10, 11）、Mozilla Firefox、Safari、Google Chrome、Opera等。同时python有对应的selenium包。selenium直接与chromedriver进行交互，调用chromedriver原生api实现浏览器的操作，可以利用浏览器保存完整的mht文件。

问题：保存网页时默认文件名取自html的title字段，而每个网页的该字段都一样；真正的标题在h1字段，因此需要修改windows保存窗口的文件名栏，而操作windows窗口则超出了selenium的能力范畴。

为了解决该问题，先后调研了如下方式：AutoHotKey、AutoIt。其中AutoHotKey是一款免费的、Windows平台下开放源代码的热键脚本语言，可以通过脚本调用键鼠、系统接口及程序，并创建基于简单语言的图形化界面的执行程序。

该工具非常强大，可以自动化完成高重复性操作，适用于自动化测试和生产环境。

AutoHotKey可将该语言脚本编译成exe，供python调用；AutoIt则有对应的pyautoit包，最终实现也是调用AutoItX3.dll。

pyautogui包可以操作键鼠，不需要调用其他动态库，且使用相对简单，是该问题最后选取的方案。为了改善人机交互，使用tkinter实现了简单的对话框界面。

# -*- coding: utf-8 -*-

import math
import os
import re
import time

import tkinter.messagebox
from tkinter import *
from tkinter.filedialog import askdirectory

import pyautogui
import pyperclip
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service


def init_driver(path, quiet=True):
    """Start a chromedriver service and a Chrome session.

    Args:
        path: location of the chromedriver executable.
        quiet: when true, the headless-mode flags are added as well.

    Returns:
        A ``(service, driver)`` tuple.
    """
    service = Service(executable_path=path)
    service.start()

    # Flags passed to every session.
    flags = [
        "--save-page-as-mhtml",
        "--start-maximized",
        '--disable-infobars',
        '--ignore-certificate-errors',
        '--ignore-ssl-errors',
    ]
    if quiet:
        # Silent mode.
        flags += ['--headless', '--disable-gpu', "--hide-scrollbars"]

    custom_options = webdriver.ChromeOptions()
    for flag in flags:
        custom_options.add_argument(flag)

    # NOTE(review): Chrome is handed executable_path rather than the service
    # started above, so it may launch a second chromedriver of its own --
    # confirm against the installed Selenium version.
    driver = webdriver.Chrome(executable_path=path, options=custom_options)
    return service, driver


def exit_driver(service, driver):
    """Quit *driver* and stop *service*.

    ``service.stop()`` runs even when ``driver.quit()`` raises, so a failing
    browser shutdown does not leave the service running.  The exception from
    ``driver.quit()`` still propagates to the caller.
    """
    try:
        driver.quit()
    finally:
        service.stop()


def save_page_as(name):
    """Send the keystrokes that save the current page under *name*.

    Presses Ctrl+S, pastes *name* from the clipboard, then presses Enter,
    pausing one second after each step.
    """
    pause_seconds = 1

    # Save.
    pyautogui.hotkey('ctrl', 's')
    time.sleep(pause_seconds)

    # The name goes through the clipboard because pyautogui does not support
    # Chinese text.
    pyperclip.copy(name)
    pyautogui.hotkey('ctrl', 'v')
    time.sleep(pause_seconds)

    pyautogui.hotkey('enter')
    time.sleep(pause_seconds)
    # An already existing file (overwrite on save) is not supported yet.


def save_mht(driver_path, tgt_url_list, file_path_list):
    """Open every target URL in a visible Chrome window and save it.

    Args:
        driver_path: location of the chromedriver executable.
        tgt_url_list: one list of page URLs per category.
        file_path_list: the directory for each category, in the same order
            as *tgt_url_list*.

    Each page is saved through ``save_page_as`` as
    ``<category directory>\\<text of the page's h1>``.  The driver and the
    service are shut down even when a step raises, so no browser is left
    behind.
    """
    allowed_chrome_tab_cnt = 5
    opened_chrome_tab_cnt = 0

    service, driver = init_driver(driver_path, quiet=False)
    try:
        for url_list, file_path in zip(tgt_url_list, file_path_list):
            print(file_path)
            for url in url_list:
                driver.get(url)
                opened_chrome_tab_cnt += 1
                title_text = driver.find_element_by_tag_name('h1').text
                print(title_text)
                save_page_as(file_path + "\\" + title_text)

                # Open a new tab and make it the active window.
                pyautogui.hotkey('ctrl', 't')
                driver.switch_to.window(driver.window_handles[-1])
                # Temporary workaround: Chrome's search engine cannot be
                # changed, which makes the new-tab page block the connection,
                # so the URL is pasted into the new tab.
                pyperclip.copy(url)
                pyautogui.hotkey('ctrl', 'v')
                pyautogui.hotkey('enter')

                # Close the tabs opened so far so that Chrome does not run
                # out of memory with too many tabs.
                if opened_chrome_tab_cnt == allowed_chrome_tab_cnt:
                    handle = driver.current_window_handle
                    while opened_chrome_tab_cnt != 0:
                        driver.switch_to.window(driver.window_handles[0])
                        driver.close()
                        opened_chrome_tab_cnt -= 1
                        time.sleep(2)
                    driver.switch_to.window(handle)

        time.sleep(6)
    finally:
        exit_driver(service, driver)


def get_tgt_url(driver_path, start_index_list, start_item_list, item_total_cnt_list):
    """Collect the announcement page URLs to save for each category.

    Args:
        driver_path: location of the chromedriver executable.
        start_index_list: per category, the number of the first index page
            to read (string).
        start_item_list: per category, the 1-based position of the first
            item to take, counted from the first index page read (string).
        item_total_cnt_list: per category, how many items to collect
            (string); an empty string skips the category.

    Returns:
        A list with one list of absolute URLs per category.

    The driver is shut down even when loading or parsing a page raises.
    Collection for a category stops as soon as the requested number of URLs
    has been gathered, so later index pages can no longer add extra entries.
    """
    item_per_page_cnt = 8
    home_page_url = "http://www.weain.mil.cn"
    index_page_id_list = ["/cggg/zbgg/", "/cggg/zbgg1/", "/cggg/gzgg/", "/cggg/qtgg/"]
    tgt_url_list = [[] for _ in index_page_id_list]

    service, driver = init_driver(driver_path)
    try:
        for i, index_page_id in enumerate(index_page_id_list):
            if item_total_cnt_list[i] == "":
                continue

            start_index = int(start_index_list[i])
            start_item = int(start_item_list[i])
            item_total_cnt = int(item_total_cnt_list[i])
            if item_total_cnt <= 0:
                # Nothing was requested for this category.
                continue

            # Round up: number of index pages needed to cover the skipped
            # leading items plus the requested ones.
            index_page_cnt = math.ceil((start_item + item_total_cnt - 1) / item_per_page_cnt)

            collected = tgt_url_list[i]
            href_pattern = re.compile("^" + index_page_id)
            href_cnt = 1
            for page_index in range(index_page_cnt):
                # Address of the index page: page 1 has no number suffix.
                if start_index == 1 and page_index == 0:
                    index_page_url = home_page_url + index_page_id + "index.html"
                else:
                    index_page_url = home_page_url + index_page_id + \
                                     "index_" + str(start_index + page_index) + ".html"

                print(index_page_url)
                # Pull the target URLs out of the index page.
                driver.get(index_page_url)
                soup = BeautifulSoup(driver.page_source, "html.parser")
                href_list = soup.find_all(href=href_pattern, target=True, title=True)
                for j in href_list:
                    if href_cnt != start_item:
                        # Still skipping the items before the start item.
                        href_cnt += 1
                        continue
                    print(home_page_url + j['href'])
                    collected.append(home_page_url + j['href'])
                    if len(collected) >= item_total_cnt:
                        break

                if len(collected) >= item_total_cnt:
                    break
    finally:
        exit_driver(service, driver)

    return tgt_url_list


def center_window(root, width, height):
    """Size *root* to *width* x *height* and centre it on the screen.

    The offsets are half of the space left over on the screen, truncated to
    whole pixels, and are applied through ``root.geometry``.
    """
    x_offset = int((root.winfo_screenwidth() - width) / 2)
    y_offset = int((root.winfo_screenheight() - height) / 2)
    root.geometry("{:d}x{:d}+{:d}+{:d}".format(int(width), int(height), x_offset, y_offset))


def main_process(cb_name_list, start_index_list, start_item_list, item_total_cnt_list, file_save_path):
    """Create the output directories, collect the target URLs and save them.

    Args:
        cb_name_list: category names, used as sub-directory names.
        start_index_list: per category, first index page (string).
        start_item_list: per category, first item position (string).
        item_total_cnt_list: per category, number of items to save (string);
            an empty string means the category is skipped.
        file_save_path: directory under which a time-stamped directory is
            created for this run.

    Raises:
        SystemExit: with status 1 when chromedriver.exe is not found next to
            this file (an error dialog is shown first).
    """
    # Current location (to keep deployment simple, chromedriver is expected
    # in the same directory).
    pwd = os.path.split(os.path.realpath(__file__))[0]
    chrome_driver_path = pwd + "\\chromedriver.exe"
    if not os.path.exists(chrome_driver_path):
        tkinter.messagebox.showerror("错误", "chromedriver必须与exe同一路径")
        # SystemExit is raised directly: the exit() helper is installed by
        # the site module and is meant for interactive use, and this is an
        # error path, so the status is non-zero.
        raise SystemExit(1)

    # A directory named after the current time avoids overwriting the files
    # of an earlier run.  makedirs also creates missing parent directories.
    localtime = time.strftime('%Y-%m-%d_%H-%M-%S')
    file_save_path = file_save_path + "\\" + localtime
    os.makedirs(file_save_path)

    file_save_path_list = []
    for cb_name, item_total_cnt in zip(cb_name_list, item_total_cnt_list):
        category_path = file_save_path + "\\" + cb_name
        file_save_path_list.append(category_path)
        # Only categories with a requested item count get a directory.
        if item_total_cnt != "":
            os.mkdir(category_path)

    # Collect the target URLs silently.
    target_url_list = get_tgt_url(chrome_driver_path, start_index_list, start_item_list, item_total_cnt_list)

    # Save the target URLs.
    save_mht(chrome_driver_path, target_url_list, file_save_path_list)


def main():
    """Show the dialog, validate the entries and run the save job.

    For each of the four categories the dialog offers a checkbox, a start
    index page, a start item and the number of items to save, plus one
    target directory for all of them.  Pressing the confirm button validates
    the entries and, when they are acceptable, hides the window and calls
    ``main_process``.
    """
    root = Tk()
    root.title("保存MHT")
    root.resizable(width=False, height=False)
    center_window(root, 550, 400)

    def select_path():
        # Keep the current value when the chooser is cancelled (it then
        # returns an empty value).
        path_ = askdirectory()
        if path_:
            save_path_text.set(path_.replace("/", "\\"))

    def check():
        # Unticking a category clears its item-count entry.
        for i in range(4):
            if cb_var_list[i].get() == '0':
                entry_item_cnt_list[i].delete(0, END)

    def parse_int(text):
        # Return the integer in *text*, or None when it is not a number.
        try:
            return int(text)
        except ValueError:
            return None

    entry_start_index_list = []
    entry_start_item_list = []
    entry_item_cnt_list = []
    cb_name_list = ["招标公告", "中标公告", "更正公告", "其他公告"]
    cb_var_list = [StringVar() for n in range(4)]
    start_index_text_list = [StringVar() for n in range(4)]
    start_item_text_list = [StringVar() for n in range(4)]
    item_cnt_text_list = [StringVar() for n in range(4)]

    for i in range(4):
        frame = Frame(root)
        cb_var_list[i].set(0)
        Checkbutton(frame, text=cb_name_list[i], variable=cb_var_list[i], command=check).grid(row=0)

        Label(frame, text="起始索引页:").grid(row=1, column=0)
        start_index_text_list[i].set("1")
        entry_start_index_list.append(Entry(frame, textvariable=start_index_text_list[i], width=10))
        entry_start_index_list[i].grid(row=1, column=1)

        Label(frame, text="起始条目:").grid(row=1, column=2, padx=20)
        start_item_text_list[i].set("1")
        entry_start_item_list.append(Entry(frame, textvariable=start_item_text_list[i], width=10))
        entry_start_item_list[i].grid(row=1, column=3)

        Label(frame, text="保存条目数:").grid(row=1, column=4, padx=20)
        entry_item_cnt_list.append(Entry(frame, textvariable=item_cnt_text_list[i], width=10))
        entry_item_cnt_list[i].grid(row=1, column=5)

        frame.place(x=0, y=i*70)

    save_path_text = StringVar()

    frame = Frame(root)
    Label(frame, text=" 保存路径:    ").grid(row=0)
    Entry(frame, textvariable=save_path_text, width=60).grid(row=0, column=1)
    Button(frame, text="...", width=1, command=select_path).grid(row=0, column=2)

    def get_content():
        start_index_list = [text.get() for text in start_index_text_list]
        start_item_list = [text.get() for text in start_item_text_list]
        item_total_cnt_list = [text.get() for text in item_cnt_text_list]

        # Check the entries.
        warning_flag = False
        for i in range(4):
            start_item = parse_int(start_item_list[i])
            if start_item is not None and start_item > 8:
                warning_flag = True
                tkinter.messagebox.showwarning("警告", "起始条目不能超过8")

        for i in range(4):
            if cb_var_list[i].get() == '1' and item_total_cnt_list[i] == '':
                warning_flag = True
                tkinter.messagebox.showwarning("警告", "请填入 [" + cb_name_list[i] + "] 保存条目数")
            if cb_var_list[i].get() == '0' and item_total_cnt_list[i] != '':
                warning_flag = True
                tkinter.messagebox.showwarning("警告", "请勾选 [" + cb_name_list[i] + "] ")

        # Every category that will be processed must carry usable numbers;
        # otherwise the job would fail only after the window is hidden.
        for i in range(4):
            if item_total_cnt_list[i] == '':
                continue
            numbers = [parse_int(text) for text in
                       (start_index_list[i], start_item_list[i], item_total_cnt_list[i])]
            if any(number is None or number < 1 for number in numbers):
                warning_flag = True
                tkinter.messagebox.showwarning(
                    "警告", "[" + cb_name_list[i] + "] 的起始索引页、起始条目和保存条目数必须为正整数")

        check_nothing_cnt = 0
        for i in range(4):
            if cb_var_list[i].get() == '0' and item_total_cnt_list[i] == '':
                check_nothing_cnt += 1
        if check_nothing_cnt == 4:
            warning_flag = True
            tkinter.messagebox.showwarning("警告", "请填入‘保存条目数’")

        file_save_path = save_path_text.get()
        if not file_save_path:
            warning_flag = True
            tkinter.messagebox.showwarning("警告", "请选择‘保存路径’")

        if warning_flag:
            return

        # Hide the main window (and with it the parent of later messages).
        root.withdraw()
        try:
            main_process(cb_name_list, start_index_list, start_item_list, item_total_cnt_list, file_save_path)
        except Exception as err:
            # The window is hidden at this point: report the failure and end
            # the main loop instead of leaving an invisible process behind.
            tkinter.messagebox.showerror("错误", str(err))
            root.destroy()
            raise
        tkinter.messagebox.showinfo("保存MHT", "完成")
        root.destroy()
        raise SystemExit(0)

    Button(frame, text="确定", width=8, command=get_content).grid(row=1, column=1, pady=20)
    frame.place(x=0, y=300)

    root.mainloop()


# Open the dialog only when this file is run as a script.
if __name__ == '__main__':
    main()