import urllib3
urllib3.disable_warnings()
import re
# Page to scrape and the browser-like User-Agent sent with the request.
url = 'https://www.jianshu.com'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36'
}


def pair_titles_with_abstracts(html):
    """Return ``(title, abstract)`` tuples extracted from *html*.

    Titles are the text of ``<a class="title" target="_blank...">`` links and
    abstracts are the bodies of ``<p class="abstract">`` paragraphs. The two
    lists are matched up by position; abstracts are returned exactly as they
    appear in the markup (surrounding whitespace is not stripped).

    Args:
        html: Decoded HTML text of the page.

    Returns:
        A list of ``(title, abstract)`` tuples. If the page yields a different
        number of titles and abstracts, the surplus items are dropped instead
        of raising ``IndexError``.
    """
    titles = re.findall(
        r'<a class="title" target="_blank.*?">(.*?)</a>', html
    )
    # re.S: an abstract may span several lines inside its <p> element.
    abstracts = re.findall(r'<p class="abstract">(.*?)</p>', html, re.S)
    return list(zip(titles, abstracts))


def print_articles_with_urllib3():
    """Fetch ``url`` through urllib3 and print each title with its abstract."""
    pool = urllib3.PoolManager()
    resp = pool.request('GET', url, headers=headers)
    url_content = resp.data.decode()
    for title, abstract in pair_titles_with_abstracts(url_content):
        print(title)
        print(abstract)
        print('=============================================================')


if __name__ == '__main__':
    print_articles_with_urllib3()
# --- urllib version ---
from urllib import request
import re

# Page to scrape and the browser-like User-Agent sent with the request.
url = "https://www.jianshu.com"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36'
}


def extract_articles(page):
    """Return ``(title, abstract)`` tuples found in the HTML text *page*.

    A single pattern captures each title link together with the first
    ``<p class="abstract">`` paragraph that follows it, so a title and its
    abstract always come from the same match.

    Args:
        page: Decoded HTML text of the page.

    Returns:
        A list of ``(title, abstract)`` tuples; empty when nothing matches.
        Abstract text is returned as it appears in the markup, unstripped.
    """
    # re.S lets ``.`` cross newlines between the link and its abstract.
    return re.findall(
        r'<a class="title" target="_blank" .*?>(.*?)</a>.*?<p class="abstract">(.*?)</p>',
        page,
        re.S,
    )


def print_articles_with_urllib():
    """Fetch ``url`` through urllib and print each title with its abstract."""
    req = request.Request(url, headers=headers)
    # The context manager closes the HTTP response once the body is read.
    with request.urlopen(req) as resp:
        page = resp.read().decode()
    for title, article in extract_articles(page):
        print(title)
        print(article)
        print('=====================================')


if __name__ == '__main__':
    print_articles_with_urllib()
# 网友评论 (reader comments)