import getopt
import os
import random
import re
import sys

import html2text
import requests
from bs4 import BeautifulSoup
# Pool of desktop-browser User-Agent strings; each request picks one at
# random so repeated downloads look less like a single automated client.
useragents = [
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
]
def jinashu(url):
    """Download a Jianshu article and save it as Markdown under ./jianshu/.

    :param url: full URL of a jianshu.com article page.
    :raises requests.HTTPError: if the server answers with an error status.
    """
    headers = {
        'Host': 'www.jianshu.com',
        'Referer': 'https://www.jianshu.com/',
        'User-Agent': random.choice(useragents)
    }
    # timeout keeps the script from hanging forever on a stalled connection
    resp = requests.get(url, headers=headers, timeout=30)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html5lib")
    title = soup.find_all("title")[0].get_text()
    article = str(soup.find_all("div", class_="show-content")[0])
    # Jianshu serves images through lazy-load / protocol-relative attributes;
    # rewrite them to absolute https URLs so the Markdown renders offline.
    article = re.sub('(src=")|(data-original-src=")', 'src="https:', article)
    dirpath = os.getcwd() + '/jianshu/'
    write2md(dirpath, title, article)
    print(title + "下载完成....")


def csdn(url):
    """Download a CSDN blog post and save it as Markdown under ./CSDN/.

    :param url: full URL of a blog.csdn.net article page.
    :raises requests.HTTPError: if the server answers with an error status.
    """
    headers = {
        'Host': 'blog.csdn.net',
        'Referer': 'http://blog.csdn.net/',
        'User-Agent': random.choice(useragents)
    }
    resp = requests.get(url, headers=headers, timeout=30)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, 'html5lib')
    title = soup.find_all('title')[0].get_text()
    # CSDN wraps the post body in a semantic <article> element
    article = str(soup.find_all('article')[0])
    dirpath = os.getcwd() + '/CSDN/'
    write2md(dirpath, title, article)
    print(title + "下载完成....")
def zhihu(url):
    """Download a Zhihu column article and save it as Markdown under ./ZhiHu/.

    :param url: full URL of a zhuanlan.zhihu.com article page.
    :raises requests.HTTPError: if the server answers with an error status.
    """
    headers = {
        'Host': 'zhuanlan.zhihu.com',
        'Referer': 'https://www.zhihu.com/',
        'User-Agent': random.choice(useragents)
    }
    # timeout keeps the script from hanging forever on a stalled connection
    resp = requests.get(url, headers=headers, timeout=30)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, 'html5lib')
    title = soup.find_all('title')[0].get_text()
    # Zhihu column posts keep their body in a div.Post-RichText container
    article = str(soup.find_all('div', class_='Post-RichText')[0])
    dirpath = os.getcwd() + '/ZhiHu/'
    write2md(dirpath, title, article)
    print(title + "下载完成....")
def doelse(url):
    """Fallback downloader: save the whole <body> of *url* as Markdown under ./Else/.

    Used when the URL matches none of the known sites.

    :param url: any http(s) URL.
    :raises requests.HTTPError: if the server answers with an error status.
    """
    headers = {'User-Agent': random.choice(useragents)}
    # timeout keeps the script from hanging forever on a stalled connection
    res = requests.get(url=url, headers=headers, timeout=30)
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'html5lib')
    title = soup.title.text
    # Pass the raw HTML on: write2md performs the html2text conversion itself.
    # The original converted here as well, so the text went through html2text
    # twice; it also created an unused './Else/' dir and then wrote to './ELSE/'.
    article = str(soup.body)
    dirpath = os.getcwd() + '/Else/'
    write2md(dirpath, title, article)
    print(title + "下载完成....")
""" 传入文件路径,title,article """ def write2md(dirpath,title,article): h2md = html2text.HTML2Text() h2md.ignore_links = False article = h2md.handle(article) if not os.path.exists(dirpath): os.makedirs(dirpath) with open(dirpath+title+'.md','w',encoding="utf8") as f: lines = article.splitlines() for line in lines: if line.endswith('-'): f.write(line) else: f.write(line+"\n")
def main(argv):
    """Parse command-line options and dispatch the given URL.

    Usage: ``python html2md.py -u <url>``

    :param argv: argument list without the program name (``sys.argv[1:]``).
    """
    usage = "python html2md.py -u <url>"
    try:
        # "url=" (with '=') declares that the long option takes a value;
        # the original's "url" made `--url <x>` a parse error.
        opts, args = getopt.getopt(argv, "hu:", ["url="])
    except getopt.GetoptError:
        print(usage)
        # Original fell through here with `opts` unbound -> NameError.
        sys.exit(2)
    for opt, arg in opts:
        if opt == "-h":
            print(usage)
            sys.exit(0)  # help is not an error; original exited with 2
        elif opt in ("-u", "--url"):
            # getopt yields "--url", so the original's "-url" never matched
            checkSite(arg)
        else:
            print(usage)
def checkSite(url):
    """Route *url* to the downloader whose site token appears in it.

    Tokens are checked in order (csdn, jianshu, zhihu); anything else
    falls back to the generic handler.
    """
    dispatch = (
        ('csdn', csdn),
        ('jianshu', jinashu),
        ('zhihu', zhihu),
    )
    for token, handler in dispatch:
        if token in url:
            handler(url)
            return
    doelse(url)
if __name__ == "__main__": main(sys.argv[1:])