#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2019-01-12 19:47:42
# Project: lianjia
from pyspider.libs.base_handler import *
from lxml import etree
import pymongo
class Handler(BaseHandler):
    """Crawl second-hand housing listings from su.lianjia.com and store them in MongoDB."""

    crawl_config = {
    }

    def __init__(self):
        # Initialize the MongoDB connection; records go to db "lianjia",
        # collection "items" on the local default port.
        client = pymongo.MongoClient(host='127.0.0.1', port=27017)
        conn = client['lianjia']
        self.db = conn['items']

    @every(minutes=24 * 60)
    def on_start(self):
        """Entry point, rescheduled daily: enqueue listing pages 1-100."""
        for page in range(1, 101):
            self.crawl('https://su.lianjia.com/ershoufang/pg{}'.format(page),
                       callback=self.index_page)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        """Parse one listing page and insert each listing as its own document."""
        selector = etree.HTML(response.text)
        items = selector.xpath('//div[@class="info clear"]')
        for item in items:
            # BUG FIX: build a fresh dict per listing. The original reused one
            # dict across iterations, so a listing missing a field silently
            # inherited the previous listing's value.
            info = {}
            title = item.xpath('./div/a/text()')
            price = item.xpath('./div[@class="priceInfo"]/div[@class="totalPrice"]//text()')
            houseinfo = item.xpath('./div[@class="address"]/div[@class="houseInfo"]/text()')
            if title:
                info['title'] = title[0]
            if price:
                info['price'] = ''.join(price)
            if houseinfo:
                info['houseinfo'] = ''.join(houseinfo)
            if info:
                # insert_one() replaces Collection.insert(), which was
                # deprecated in pymongo 3.x and removed in 4.x.
                self.db.insert_one(info)
            print(info)

    @config(priority=2)
    def detail_page(self, response):
        """Return url/title for a detail page.

        NOTE(review): nothing in index_page schedules this callback — it looks
        like leftover pyspider boilerplate; confirm before removing.
        """
        return {
            "url": response.url,
            "title": response.doc('title').text(),
        }
# 网友评论 (= "user comments") — residue from the web page this script was
# copied from; kept as a comment so the file remains valid Python.