import getopt
import os
import random
import re
import sys

import html2text
import requests
from bs4 import BeautifulSoup
# Pool of desktop-browser User-Agent strings; each request picks one at
# random so repeated downloads look less like a single automated client.
useragents = [
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
]
def jinashu(url):
    """Download a Jianshu article and save it as Markdown under ./jianshu/.

    :param url: full URL of a jianshu.com article page.
    :raises requests.HTTPError: if the server answers with an error status.
    """
    headers = {
        'Host': 'www.jianshu.com',
        'Referer': 'https://www.jianshu.com/',
        'User-Agent': random.choice(useragents)
    }
    # timeout keeps the script from hanging forever on a stalled connection
    resp = requests.get(url, headers=headers, timeout=30)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html5lib")
    title = soup.find_all("title")[0].get_text()
    article = str(soup.find_all("div", class_="show-content")[0])
    # Jianshu serves images through lazy-load / protocol-relative attributes;
    # rewrite them to absolute https URLs so the Markdown renders offline.
    article = re.sub('(src=")|(data-original-src=")', 'src="https:', article)
    dirpath = os.getcwd() + '/jianshu/'
    write2md(dirpath, title, article)
    print(title + "下载完成....")


def csdn(url):
    """Download a CSDN blog post and save it as Markdown under ./CSDN/.

    :param url: full URL of a blog.csdn.net article page.
    :raises requests.HTTPError: if the server answers with an error status.
    """
    headers = {
        'Host': 'blog.csdn.net',
        'Referer': 'http://blog.csdn.net/',
        'User-Agent': random.choice(useragents)
    }
    resp = requests.get(url, headers=headers, timeout=30)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, 'html5lib')
    title = soup.find_all('title')[0].get_text()
    # CSDN wraps the post body in a semantic <article> element
    article = str(soup.find_all('article')[0])
    dirpath = os.getcwd() + '/CSDN/'
    write2md(dirpath, title, article)
    print(title + "下载完成....")
def zhihu(url):
    """Download a Zhihu column article and save it as Markdown under ./ZhiHu/.

    :param url: full URL of a zhuanlan.zhihu.com article page.
    :raises requests.HTTPError: if the server answers with an error status.
    """
    headers = {
        'Host': 'zhuanlan.zhihu.com',
        'Referer': 'https://www.zhihu.com/',
        'User-Agent': random.choice(useragents)
    }
    # timeout keeps the script from hanging forever on a stalled connection
    resp = requests.get(url, headers=headers, timeout=30)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, 'html5lib')
    title = soup.find_all('title')[0].get_text()
    # Zhihu column posts keep their body in a div.Post-RichText container
    article = str(soup.find_all('div', class_='Post-RichText')[0])
    dirpath = os.getcwd() + '/ZhiHu/'
    write2md(dirpath, title, article)
    print(title + "下载完成....")
def doelse(url):
    """Fallback downloader: save the whole <body> of *url* as Markdown under ./Else/.

    Used when the URL matches none of the known sites.

    :param url: any http(s) URL.
    :raises requests.HTTPError: if the server answers with an error status.
    """
    headers = {'User-Agent': random.choice(useragents)}
    # timeout keeps the script from hanging forever on a stalled connection
    res = requests.get(url=url, headers=headers, timeout=30)
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'html5lib')
    title = soup.title.text
    # Pass the raw HTML on: write2md performs the html2text conversion itself.
    # The original converted here as well, so the text went through html2text
    # twice; it also created an unused './Else/' dir and then wrote to './ELSE/'.
    article = str(soup.body)
    dirpath = os.getcwd() + '/Else/'
    write2md(dirpath, title, article)
    print(title + "下载完成....")
""" 传入文件路径,title,article """ def write2md(dirpath,title,article): h2md = html2text.HTML2Text() h2md.ignore_links = False article = h2md.handle(article) if not os.path.exists(dirpath): os.makedirs(dirpath) with open(dirpath+title+'.md','w',encoding="utf8") as f: lines = article.splitlines() for line in lines: if line.endswith('-'): f.write(line) else: f.write(line+"\n")
def main(argv):
    """Parse command-line options and dispatch the given URL.

    Usage: ``python html2md.py -u <url>``

    :param argv: argument list without the program name (``sys.argv[1:]``).
    """
    usage = "python html2md.py -u <url>"
    try:
        # "url=" (with '=') declares that the long option takes a value;
        # the original's "url" made `--url <x>` a parse error.
        opts, args = getopt.getopt(argv, "hu:", ["url="])
    except getopt.GetoptError:
        print(usage)
        # Original fell through here with `opts` unbound -> NameError.
        sys.exit(2)
    for opt, arg in opts:
        if opt == "-h":
            print(usage)
            sys.exit(0)  # help is not an error; original exited with 2
        elif opt in ("-u", "--url"):
            # getopt yields "--url", so the original's "-url" never matched
            checkSite(arg)
        else:
            print(usage)
def checkSite(url):
    """Route *url* to the downloader whose site token appears in it.

    Tokens are checked in order (csdn, jianshu, zhihu); anything else
    falls back to the generic handler.
    """
    dispatch = (
        ('csdn', csdn),
        ('jianshu', jinashu),
        ('zhihu', zhihu),
    )
    for token, handler in dispatch:
        if token in url:
            handler(url)
            return
    doelse(url)
if __name__ == "__main__": main(sys.argv[1:])