10.5 网页数据爬虫实战01
作业1
爬取一条腾讯新闻的header内容,存入txt
- 包含网页链接
- 包含title
- 包含所有headers信息
# -*- coding: utf-8 -*-
"""
Assignment 1: fetch one Tencent news page and save its URL, its <title>,
and every HTTP response header to a text file on the desktop.
"""
import requests
from bs4 import BeautifulSoup

url = 'http://news.qq.com/a/20170417/002250.htm'
r = requests.get(url=url)
headers = r.headers  # HTTP response headers (case-insensitive dict)

soup = BeautifulSoup(r.text, 'lxml')
title = soup.title  # the page's <title> tag (may be None if absent)

# Use a context manager so the file is always closed — the original called
# `f.close` without parentheses, which never actually closed the file.
# An explicit encoding makes writing Chinese text reliable on any locale.
with open('C:\\Users\\Administrator\\Desktop\\headers.txt', 'w', encoding='utf-8') as f:
    f.write('爬取网页:' + url + '\n')
    f.write('新闻标题为:' + str(title) + '\n')
    for name in headers:
        f.writelines([name, ':', headers[name], '\n'])
print('finished!')

作业2
爬取腾讯新闻网站上,某一天的某类新闻标题:
- 开头:“XX年XX月XX日腾讯新闻”
- 包括新闻标题和网址
# -*- coding: utf-8 -*-
"""
Assignment 2: scrape the Tencent News world-index page and save every
headline dated 2017-04-17 (title + article URL) to a text file.
"""
import requests
from bs4 import BeautifulSoup
import re

r = requests.get(url='http://news.qq.com/world_index.shtml')
soup = BeautifulSoup(r.text, 'lxml')

# Match every <a> whose href points at an article published that day.
news = soup.find_all('a', href=re.compile('http://news.qq.com/a/20170417/'))

# Context manager guarantees the file is closed even if a write fails;
# explicit encoding makes the Chinese header line safe on any locale.
with open('C:\\Users\\Administrator\\Desktop\\news.txt', 'w', encoding='utf-8') as f:
    f.write('2017年4月17日腾讯新闻\n')
    for link in news:
        headline = link.text.strip()  # drop leading/trailing whitespace
        if not headline:
            continue  # skip image-only anchors that carry no visible title
        f.writelines([headline, ',url=', link.attrs['href'], '\n'])
print('finished!')

如何获取每条新闻的内容(二级网址)
# -*- coding: utf-8 -*-
"""
Follow second-level URLs: for every headline of the day, fetch the
article page itself and save the title plus the full body text.
"""
import requests
from bs4 import BeautifulSoup
import re

r = requests.get(url='http://news.qq.com/world_index.shtml')
soup = BeautifulSoup(r.text, 'lxml')

# Match every <a> whose href points at an article published that day.
news = soup.find_all('a', href=re.compile('http://news.qq.com/a/20170417/'))

with open('C:\\Users\\Administrator\\Desktop\\news.txt', 'w', encoding='utf-8') as f:
    f.write('2017年4月17日腾讯新闻\n')
    for link in news:
        headline = link.text.strip()
        if not headline:
            continue  # skip image-only anchors with no visible title
        # Fetch the article page (second-level URL) and parse its body.
        article = requests.get(url=link.attrs['href'])
        article_soup = BeautifulSoup(article.text, 'lxml')
        f.write(headline + '\n')
        f.write('正文如下:\n')
        # Guard against pages that lack the expected content container —
        # the original crashed with AttributeError on find() returning None.
        # (Distinct loop variables also fix the original's shadowed `i`.)
        body = article_soup.find('div', id='Cnt-Main-Article-QQ')
        if body is not None:
            for paragraph in body.find_all('p'):
                f.write(paragraph.text + '\n')
        f.write('\n')
print('finished!')
网友评论