html转markdown

html转markdown

写了一个将博客文章转成 Markdown 的脚本,目前支持简书、知乎、CSDN(其他网站会按通用方式转换整个页面正文),项目地址

使用方法:`python html2md.py -u <url>`

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
import os
import sys
import getopt
import requests
import random
import re
import html2text
from bs4 import BeautifulSoup

# Pool of desktop browser User-Agent strings; the downloader functions pick
# one at random (random.choice) for each request.
useragents = [
'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
]

def jinashu(url):
    """Download a Jianshu article and save it as Markdown under ``./jianshu/``.

    Fetches ``url``, uses the text of the page ``<title>`` as the output file
    name and the first ``div.show-content`` element as the article body, then
    hands both to ``write2md``.

    Args:
        url: Address of the Jianshu article page.

    Raises:
        IndexError: If the page has no ``<title>`` or no ``div.show-content``
            element.
    """
    # Browser-like request headers.
    headers = {
        'Host': 'www.jianshu.com',
        'Referer': 'https://www.jianshu.com/',
        'User-Agent': random.choice(useragents)
    }
    # Fetch the page source.
    html = requests.get(url, headers=headers).text

    # Parse the page and pull out the title and the article container.
    soup = BeautifulSoup(html, "html5lib")
    title = soup.find_all("title")[0].get_text()
    article = str(soup.find_all("div", class_="show-content")[0])

    # Make image URLs usable outside the site: rename ``data-original-src``
    # to ``src`` and add the scheme to protocol-relative URLs ("//host/...").
    # The scheme is added ONLY when the value starts with "//"; prefixing
    # every src (as before) turned absolute URLs into "https:https://...".
    article = re.sub(
        r'(?:data-original-)?src="(//)?',
        lambda m: 'src="https://' if m.group(1) else 'src="',
        article,
    )

    # Write the result into the "jianshu" folder of the working directory.
    pwd = os.getcwd()
    dirpath = pwd + '/jianshu/'
    write2md(dirpath, title, article)
    print(title + "下载完成....")

def csdn(url):
    """Save a CSDN blog post as Markdown in the ``CSDN`` folder of the working directory.

    The text of the page ``<title>`` becomes the file name and the first
    ``<article>`` element is the HTML passed on to ``write2md``.

    Args:
        url: Address of the CSDN post.
    """
    # Request the page with browser-like headers.
    response = requests.get(
        url,
        headers={
            'Host': 'blog.csdn.net',
            'Referer': 'http://blog.csdn.net/',
            'User-Agent': random.choice(useragents),
        },
    )

    # Locate the title and the article element in the parsed document.
    page = BeautifulSoup(response.text, 'html5lib')
    title_tag = page.find_all('title')[0]
    title = title_tag.get_text()
    article_tag = page.find_all('article')[0]
    article_html = str(article_tag)

    # Output goes below the current working directory.
    target_dir = os.getcwd() + '/CSDN/'
    write2md(target_dir, title, article_html)
    print(title + "下载完成....")

def zhihu(url):
    """Fetch a Zhihu article and store it as Markdown under ``./ZhiHu/``.

    Uses the page ``<title>`` text as the file name and the first
    ``div.Post-RichText`` element as the body handed to ``write2md``.

    Args:
        url: Address of the Zhihu article page.
    """
    request_headers = {
        'Host': 'zhuanlan.zhihu.com',
        'Referer': 'https://www.zhihu.com/',
        'User-Agent': random.choice(useragents),
    }
    page_source = requests.get(url, headers=request_headers).text

    # Parse the document, then take the first match of each lookup.
    document = BeautifulSoup(page_source, 'html5lib')
    titles = document.find_all('title')
    title = titles[0].get_text()
    rich_text_blocks = document.find_all('div', class_='Post-RichText')
    body_html = str(rich_text_blocks[0])

    # Write into the "ZhiHu" folder of the current working directory.
    out_dir = os.getcwd() + '/ZhiHu/'
    write2md(out_dir, title, body_html)
    print(title + "下载完成....")

def doelse(url):
    """Convert an arbitrary web page to Markdown and save it under ``./ELSE/``.

    Fallback downloader for sites without a dedicated handler: the whole
    ``<body>`` of the page is passed to ``write2md`` and the page title is
    used as the file name.

    Args:
        url: Address of the page to convert.
    """
    headers = {
        'User-Agent': random.choice(useragents)
    }
    res = requests.get(url=url, headers=headers)  # fetch the whole HTML page

    soup = BeautifulSoup(res.text, 'html5lib')
    # A page may have no <title>; fall back to a fixed name instead of
    # failing with AttributeError on ``None.text``.
    title = soup.title.text if soup.title is not None else ''
    title = title.strip() or 'untitled'

    # Pass the raw HTML on: write2md performs the HTML -> Markdown conversion
    # itself. Converting here as well (as before) fed Markdown text back into
    # html2text, which treats it as HTML text and collapses its line breaks.
    article = str(soup.body)

    # Single output directory; the earlier code also created an unused
    # "Else" directory next to the "ELSE" one that actually receives files.
    dirpath = os.getcwd() + '/ELSE/'
    write2md(dirpath, title, article)
    print(title + "下载完成....")

"""
传入文件路径,title,article
"""
def write2md(dirpath, title, article):
    """Convert an HTML fragment to Markdown and write it to ``<dirpath>/<title>.md``.

    Args:
        dirpath: Output directory; created if it does not exist.
        title: Article title, used as the file name after being reduced to
            a safe file name (it comes from the downloaded page).
        article: HTML source of the article.
    """
    # Build the converter; keep hyperlinks in the output.
    h2md = html2text.HTML2Text()
    h2md.ignore_links = False
    # Convert the document.
    article = h2md.handle(article)

    # Create the output directory when missing.
    os.makedirs(dirpath, exist_ok=True)

    # The title is untrusted page content: strip path separators and other
    # characters that are invalid in file names so the file always lands
    # inside ``dirpath``.
    target = os.path.join(dirpath, _safe_filename(title) + '.md')
    with open(target, 'w', encoding="utf8") as f:
        for line in article.splitlines():
            # Lines ending in '-' are written without a newline so they join
            # the following line -- presumably to undo html2text wrapping at
            # a hyphen; TODO confirm the intent.
            if line.endswith('-'):
                f.write(line)
            else:
                f.write(line + "\n")


def _safe_filename(title):
    """Return ``title`` reduced to something usable as a single file name."""
    # Collapse newlines/tabs/runs of spaces into single spaces.
    name = ' '.join(title.split())
    # Replace path separators, characters Windows forbids, and control chars.
    name = re.sub(r'[\\/:*?"<>|\x00-\x1f]', '_', name)
    # Leading/trailing dots would yield names such as ".." or hidden files.
    name = name.strip(' .')
    return name or 'untitled'

def main(argv):
    """Parse the command line and download the requested page.

    Supported options: ``-h`` (print usage and exit) and ``-u <url>`` /
    ``--url <url>`` (convert the page at ``<url>``).

    Args:
        argv: Command-line arguments without the program name.

    Raises:
        SystemExit: With status 2 after printing the usage text for ``-h``
            or for an invalid command line.
    """
    usage = "python html2md.py -u <url>"
    try:
        # "url=" (with '=') declares that the long option takes a value.
        opts, args = getopt.getopt(argv, "hu:", ["url="])
    except getopt.GetoptError:
        # Stop here: ``opts`` is unbound when parsing fails, so falling
        # through to the loop below would raise UnboundLocalError.
        print(usage)
        sys.exit(2)

    if not opts:
        # Nothing to do without an option; tell the user how to call us.
        print(usage)
        return

    for opt, arg in opts:
        if opt == "-h":
            print(usage)
            sys.exit(2)
        elif opt in ("-u", "--url"):  # getopt reports long options as "--url"
            print()
            checkSite(arg)
        else:
            print(usage)

# Inspect the URL and decide which downloader handles it.
def checkSite(url):
    """Dispatch ``url`` to the downloader for the site named in it.

    The URL is checked for the substrings ``csdn``, ``jianshu`` and
    ``zhihu`` in that order; the first one found selects the downloader.
    Any other URL is handled by ``doelse``.

    Args:
        url: Address of the page to download.
    """
    if 'csdn' in url:
        csdn(url)
        return
    if 'jianshu' in url:
        jinashu(url)
        return
    if 'zhihu' in url:
        zhihu(url)
        return
    doelse(url)

# Script entry point: pass the command-line arguments (without the program
# name) to main() when the file is run directly.
if __name__ == "__main__":
    main(sys.argv[1:])

禁用js复制公式

在浏览器地址栏输入 chrome://settings/content/javascript,在其中禁用目标网站的 JavaScript(例如 cnblogs),就可以禁用掉 Markdown 公式渲染,从而直接复制公式源码:

禁用js复制文字

经常遇到网页禁止转载、导致无法复制粘贴的情况(例如知乎),但是自己想记录笔记怎么办?这个时候,可以按下 F12 -> F1 -> Preferences -> Debugger -> Disable JavaScript。注意不要关闭调试窗口。

参考

知乎禁止转载的回答怎么复制做笔记? - Icy Volcano的回答 - 知乎

------ 本文结束------
坚持原创技术分享,您的支持将鼓励我继续创作!

欢迎关注我的其它发布渠道