BOSS Zhipin Scraper: An Analysis of Data Analyst Job Postings

Author: ygquincy | Published 2019-03-16 19:55

First, a note on the data source: everything below was scraped from BOSS Zhipin postings for the "data analyst" position. The analysis covers the overall salary picture for data analysts, salary distributions across cities and across education levels, how salary relates to work experience in Beijing and Shanghai, the demand for data analyst positions in Beijing, Shanghai, Guangzhou, and Shenzhen, and a word cloud of the industries the hiring companies belong to.

1. Data collection
2. Data cleaning and processing
3. Data analysis

Data Collection

import requests
from fake_useragent import UserAgent
from lxml import etree
import pymysql
import pymongo
import json
import time
from requests import RequestException

mongo_url = 'localhost'
mongo_db = 'zhaopin'

ua = UserAgent()

class Boss(object):
    def __init__(self):
        # URL template: the city code and page number are filled in by run()
        self.url = 'https://www.zhipin.com/{}/?query=数据分析&page={}'
        self.headers = {'user-agent': ua.random,
           'referer':'https://www.zhipin.com/c101020100/?query=%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90&page=1',
           # fill in a valid cookie copied from a logged-in browser session;
           # zhipin.com tends to block cookie-less requests
           'cookie': ''}
        self.client = pymongo.MongoClient(mongo_url)
        self.db = self.client[mongo_db]
        # City name -> zhipin.com city code used in the URL path
        self.cityList = {'广州':'c101280100','北京':'c101010100','上海':'c101020100','深圳':'c101280600','杭州':'c101210100','天津':'c101030100','西安':'c101110100','苏州':'c101190400','武汉':'c101200100','厦门':'c101230200','长沙':'c101250100','成都':'c101270100','郑州':'c101180100','重庆':'c101040100'}


    # def get_proxy(self):
    #     PROXY_POOL_URL = 'http://localhost:5555/random'
    #     try:
    #         response = requests.get(PROXY_POOL_URL)
    #         if response.status_code == 200:
    #             return response.text
    #     except ConnectionError:
    #         return None


    def get_one_page(self, url):
        try:
            # proxy = self.get_proxy()
            # proxies = {'http': proxy}
            # print(proxies)
            response = requests.get(url, headers=self.headers)
            if response.status_code == 200:
                return response.text
            return None
        except RequestException:
            print("Request error")
            return None

    def parse_one_page(self, html):
        # Each job posting sits in an <li>/<div class="job-primary"> block
        html = etree.HTML(html)
        content = html.xpath("//li/div[@class='job-primary']")

        for con in content:

            pos_name = con.xpath(".//div[@class='job-title']/text()")[0]
            comp_name = con.xpath(".//div[@class='info-company']/div/h3/a/text()")[0]
            salary = con.xpath(".//h3/a/span/text()")[0]
            scale = con.xpath("./div[@class='info-company']//p/text()[last()]")[0]
            education = con.xpath("./div/p/text()[3]")[0]
            industry = con.xpath(".//div[@class='company-text']/p//text()")[0]
            workyear = con.xpath("./div[@class='info-primary']/p/text()")[1]
            location = con.xpath("./div[@class='info-primary']/p/text()")[0]


            item = {'pos_name':pos_name,
                    'comp_name':comp_name,
                    'salary':salary,
                    'scale':scale,
                    'education':education,
                    'industry':industry,
                    'workyear':workyear,
                    'location':location}
            yield item

    def write_to_file(self, item):
        # Append one JSON object per line (JSON Lines format)
        with open('boss.txt', 'a', encoding='utf-8') as f:
            f.write(json.dumps(item, ensure_ascii=False)+'\n')

    def write_to_csv(self, item):
        # Naive comma-joined output; a field containing a comma would corrupt
        # the file -- csv.writer would be more robust
        with open('爬虫BOSS直聘.txt','a', encoding='utf-8') as file:
            line = str(item['pos_name']) + ',' + str(item['comp_name']) + ',' + str(item['salary']) + ',' + \
                   str(item['scale']) + ',' + str(item['education']) + ',' + str(item['industry']) + ',' + \
                   str(item['workyear']) + ',' + str(item['location']) + '\n'
            file.write(line)

    def save_to_mongo(self, item):
        # insert() was removed in recent pymongo versions; use insert_one()
        if self.db['boss'].insert_one(item):
            print("save successfully")

    def save_to_mysql(self, item):
        conn = pymysql.connect(host='localhost', user='root', password='', db='test7', port=3306,
                               charset='utf8')
        cur = conn.cursor()
        insert_data = "INSERT INTO boss(pos_name, comp_name, salary, scale, education, industry, workyear, location) VALUES(%s, %s, %s, %s, %s, %s, %s, %s)"
        val = (item['pos_name'], item['comp_name'], item['salary'], item['scale'], item['education'], item['industry'], item['workyear'], item['location'])
        cur.execute(insert_data, val)
        conn.commit()
        cur.close()
        conn.close()

    def run(self):
        # Create the output file and write the CSV header row
        title = 'pos_name,comp_name,salary,scale,education,industry,workyear,location' + '\n'
        file = open('爬虫BOSS直聘.txt', 'w', encoding='utf-8')
        file.write(title)
        file.close()


        for city in self.cityList.values():
            for page in range(1, 11):
                url = self.url.format(city, page)
                response = self.get_one_page(url)
                if response is None:
                    continue  # skip pages that failed to download
                for item in self.parse_one_page(response):
                    self.write_to_csv(item)
                time.sleep(3)  # throttle requests to avoid being blocked


if __name__ == '__main__':
    boss = Boss()
    boss.run()

Data Cleaning and Processing

[Figure: a sample of the raw scraped data]

First, the scraped location field is too detailed; we keep only the first two characters, which is enough to identify the city.
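As a rough sketch of this step (assuming the scraped file has been loaded into a pandas DataFrame named df; the loading code is not part of the original post):

import pandas as pd

# Load the comma-separated file produced by the scraper above
df = pd.read_csv('爬虫BOSS直聘.txt')

# Keep only the first two characters of the location -- the city name;
# every city crawled here happens to have a two-character name
df['location'] = df['location'].str[:2]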


[Figure: the location column truncated to the city name]

The salary format is also inconvenient: it comes as a range (for example 15k-25k), so a function is used to split it into a minimum, a maximum, and an average, which are easier to analyse.
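A minimal sketch of such a function, assuming the salary strings follow the plain '15k-25k' range pattern (postings with extra suffixes would need additional stripping first; the new column names are illustrative):

def split_salary(salary):
    # '15k-25k' -> (15, 25, 20.0)
    low, high = salary.lower().replace('k', '').split('-')
    low, high = int(low), int(high)
    return low, high, (low + high) / 2

# Expand the range into three new columns
df[['salary_min', 'salary_max', 'salary_avg']] = df['salary'].apply(
    lambda s: pd.Series(split_salary(s)))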


[Figure: salary split into minimum, maximum, and average columns]

Data Analysis

Overall salary distribution


[Figure: overall salary distribution]
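A histogram like this can be drawn with matplotlib; the sketch below assumes the salary_avg column built earlier, and the font line is only needed so Chinese labels render correctly (it assumes SimHei is installed):

import matplotlib.pyplot as plt

plt.rcParams['font.sans-serif'] = ['SimHei']  # optional: lets matplotlib render Chinese text

df['salary_avg'].hist(bins=20)
plt.xlabel('Average monthly salary (k RMB)')
plt.ylabel('Number of postings')
plt.title('Overall salary distribution')
plt.show()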

Salary distribution across cities


[Figure: salary distribution by city]
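One way to reproduce a per-city comparison is a pandas boxplot grouped by city; the same pattern also covers the education and work-experience breakdowns below:

# Box plot of average salary grouped by city
df.boxplot(column='salary_avg', by='location', figsize=(10, 5))
plt.suptitle('')  # drop the automatic super-title pandas adds
plt.title('Salary by city')
plt.ylabel('Average monthly salary (k RMB)')
plt.show()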

Salary distribution across education levels


[Figure: salary distribution by education level]

Let's also take a closer look at the detailed posting counts.


[Figure: detailed posting counts]

Now let's look at how salaries are distributed by work experience in Beijing and Shanghai.


[Figure: salary by work experience in Beijing and Shanghai]

Next, the demand for data analyst positions in Beijing, Shanghai, Guangzhou, and Shenzhen.


[Figure: demand for data analyst positions in Beijing, Shanghai, Guangzhou, and Shenzhen]
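Counting demand per city is a one-liner once the data is in a DataFrame; a sketch, again using the assumed df:

# Number of postings in the four first-tier cities
top4 = df[df['location'].isin(['北京', '上海', '广州', '深圳'])]
top4['location'].value_counts().plot(kind='bar')
plt.ylabel('Number of postings')
plt.show()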

Finally, a word cloud of the industries that the hiring companies operate in.


[Figure: word cloud of hiring companies' industries]
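The word cloud itself can be generated with the wordcloud package; a minimal sketch, where the font path is an assumption (any font file that covers Chinese characters works):

from wordcloud import WordCloud

# Join every industry value into one blob of text
text = ' '.join(df['industry'].astype(str))

wc = WordCloud(font_path='simhei.ttf',  # adjust to a Chinese-capable font on your system
               background_color='white',
               width=800, height=400).generate(text)
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
plt.show()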

The companies hiring data analysts are concentrated in the internet, mobile internet, e-commerce, and finance sectors, so targeting these fields should give a much higher success rate when job hunting.
