Scrape 51job postings for positions related to the Information Management major and analyse them visually.
Tools used: Python, Tableau (visualization), DBeaver (database management tool)
1. Data Scraping
Data scraping process
Requesting the pages directly with the requests library is easily detected by the site's anti-scraping measures, which can lead to the IP being blocked and the crawl failing, so Selenium is used to drive a real browser instead. Thirty job categories need to be collected in total. Selenium is first used to read the total number of result pages for each category, and that page count is then used to scrape the postings for each category. The browser visits the 51job search pages, the page HTML is retrieved, BeautifulSoup extracts the fields we need, and the extracted records are finally written to a database for storage.
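A minimal sketch of this pipeline, fetching a single search page with Selenium and handing the rendered HTML to BeautifulSoup, is shown below; it assumes Chrome and a matching chromedriver are installed, and the full, hardened version is built up step by step in sections 1.1 to 1.4.

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver import ChromeOptions

# Minimal end-to-end sketch: real browser -> rendered HTML -> BeautifulSoup
option = ChromeOptions()
option.add_argument('user-agent=Mozilla/5.0')  # present a normal browser user agent
web = webdriver.Chrome(options=option)
web.get('https://search.51job.com/list/000000,000000,0000,00,9,99,%E4%BA%A7%E5%93%81%E7%BB%8F%E7%90%86,2,1.html?')
soup = BeautifulSoup(web.page_source, 'lxml')  # parse the rendered page
print(soup.title.string if soup.title else 'no title')  # sanity check
web.quit()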
1.1 Import the required libraries
import requests
from bs4 import BeautifulSoup
import pymysql
import random
from selenium import webdriver
from selenium.webdriver import ChromeOptions
import re
import time
1.2 Scrape the total number of result pages for each job keyword
if __name__ == '__main__': # main entry point
job=["产品经理","产品助理","交互设计","前端开发","软件设计","IOS开发","业务分析","安卓开发","PHP开发","业务咨询","需求分析","流程设计"
,"售后经理","售前经理","技术支持","ERP实施","实施工程师","IT项目经理","IT项目助理","信息咨询","数据挖掘","数据运营","数据分析","网络营销",
"物流与供应链","渠道管理","电商运营","客户关系管理","新媒体运营","产品运营"]
#the full list of 30 job-search keywords
option = ChromeOptions()
UA="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36 Edg/94.0.992.31"
option.add_argument(f'user-agent={UA}')
option.add_experimental_option('useAutomationExtension', False)
option.add_experimental_option('excludeSwitches', ['enable-automation'])
web = webdriver.Chrome(options=option) # pass the configured ChromeOptions (chrome_options= is deprecated in newer Selenium)
web.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
"source": """
Object.defineProperty(navigator, 'webdriver', {
get: () => undefined
})
"""
})
web.implicitly_wait(3)
url='https://search.51job.com/list/000000,000000,0000,00,9,99,%E4%BA%A7%E5%93%81%E7%BB%8F%E7%90%86,2,2.html?'
web.get(url)
time.sleep(6)
page_list=[]
for j in job:
for i in range(1, 1 + 1): # only the first result page is needed; it already reports the total page count
#url = "https://search.51job.com/list/000000,000000,0000,00,9,99," + j + ",2," + str(i) + ".html?"
url="https://search.51job.com/list/000000,000000,0000,00,9,99,{},2,{}.html?".format(j, i)
web.get(url)
html = web.page_source
soup = BeautifulSoup(html, "lxml")
text = soup.find_all("script", type="text/javascript")[3].string
# inspecting the page source shows the search data is a JS object embedded in this script tag; its total_page field gives the page count
page_te=eval(str(text).split("=", 1)[1])["total_page"]
page_list.append(page_te)
print(page_te)
#The resulting page_list records how many pages to scrape for each job keyword in the next step.
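For context, the script tag selected above assigns a JS object to a variable (at crawl time it looked roughly like window.__SEARCH_RESULT__ = {...}; the exact variable name is an assumption here). Splitting on the first '=' and evaluating the right-hand side therefore yields a dict. A small self-contained illustration with a hypothetical, heavily simplified payload:

import json

text = 'window.__SEARCH_RESULT__ = {"total_page": "1141", "engine_jds": []}'  # simplified stand-in for the real script content
payload = text.split("=", 1)[1]  # everything after the first '=' is the object literal
data = json.loads(payload)       # json.loads suffices when the literal is valid JSON; eval() is the fallback used above
print(data["total_page"])        # -> 1141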
1.3 Design of the scraping functions
#define spider(), which fetches the HTML for a given url
def spider(url):
headers = {
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36 Edg/94.0.992.31"}
try:
rep = requests.get(url, headers=headers)
rep.raise_for_status()
rep.encoding = rep.apparent_encoding
txt = rep.text
return txt
except:
print("解析失败")
#define jiexi(), which parses the HTML of one result page and appends the postings to info
def jiexi(html, info,name):
soup = BeautifulSoup(html, "lxml")
text = soup.find_all("script", type="text/javascript")[3].string
#inspecting the page source shows the postings we need are stored under "engine_jds" in this JS object
data = eval(str(text).split("=", 1)[1])["engine_jds"]
for d in data:
try:
job_name = d["job_name"].replace("\\", "") # job title
except:
job_name = " "
try:
company_name = d["company_name"].replace("\\", "") # company name
except:
company_name = " "
try:
providesalary_text = d["providesalary_text"].replace("\\", "") # salary
except:
providesalary_text = " "
try:
workarea_text = d["workarea_text"].replace("\\", "") # work location
except:
workarea_text = " "
try:
updatedate = d["updatedate"].replace("\\", "") # posting update date
except:
updatedate = " "
try:
jobwelf = d["jobwelf"].replace("\\", "") # benefits
except:
jobwelf = " "
try:
companyind_text = d["companyind_text"].replace("\\", "") # company type
except:
companyind_text = " "
try:
companysize_text = d["companysize_text"].replace("\\", "") # company size
except:
companysize_text = " "
try:
at = d["attribute_text"] # job requirements (a list of attribute strings)
s = ''
for i in range(0, len(at)):
s = s + at[i] + ','
attribute_text = s[:-1]
except:
attribute_text = " "
#append each posting's fields, together with the name argument, as one list to info
info.append( [ name,job_name, updatedate, company_name, companyind_text, companysize_text, workarea_text, providesalary_text, attribute_text, jobwelf])
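As an aside, the repeated try/except blocks above could be collapsed with a small helper built on dict.get; this is only a sketch of an equivalent refactor, and the try/except version above is what the rest of the script actually uses.

def field(d, key):
    # Return d[key] with backslashes stripped, or " " when the key is missing or not a string.
    value = d.get(key, " ")
    return value.replace("\\", "") if isinstance(value, str) else " "

# e.g. job_name = field(d, "job_name"); providesalary_text = field(d, "providesalary_text")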
#save the records into the jobs table of the MySQL database
def save(info):
#map each record onto the corresponding table columns
for data in info:
present_job = data[0] # search keyword used for this record
job_name = data[1] # job title
updatedate = data[2] # update date
company_name = data[3] # company name
companyind_text = data[4] # company type
companysize_text = data[5] # company size
workarea_text = data[6] # work location
providesalary_text = data[7] # salary
attribute_text = data[8] # job requirements
jobwelf = data[9] # benefits
# build the SQL statement
sql = "insert into jobs(当前爬取岗位,岗位,更新时间,公司名称,公司类型,公司规模,工作地点,薪资,工作要求,工作待遇) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
# execute the SQL statement
cursor.execute(sql, [present_job, job_name, updatedate, company_name, companyind_text, companysize_text,
workarea_text, providesalary_text, attribute_text, jobwelf])
db.commit() # commit the inserted rows
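save() assumes that a jobs table already exists in the target database; its definition is not shown in this write-up. A plausible one-off creation matching the columns used in the INSERT (the column types and lengths are assumptions) could be run once with the same db and cursor as in section 1.4:

# One-off creation of the jobs table; column names mirror the INSERT in save(), types/lengths are assumed.
create_sql = """
CREATE TABLE IF NOT EXISTS jobs (
    当前爬取岗位 VARCHAR(50),
    岗位 VARCHAR(200),
    更新时间 VARCHAR(50),
    公司名称 VARCHAR(200),
    公司类型 VARCHAR(100),
    公司规模 VARCHAR(100),
    工作地点 VARCHAR(100),
    薪资 VARCHAR(50),
    工作要求 VARCHAR(255),
    工作待遇 VARCHAR(500)
) DEFAULT CHARSET = utf8
"""
cursor.execute(create_sql)
db.commit()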
1.4 Scrape the job postings
if __name__ == '__main__': # main entry point
job=["产品经理","产品助理","交互设计","前端开发","软件设计","IOS开发","业务分析","安卓开发","PHP开发","业务咨询","需求分析","流程设计"
,"售后经理","售前经理","技术支持","ERP实施","实施工程师","IT项目经理","IT项目助理","信息咨询","数据挖掘","数据运营","数据分析","网络营销",
"物流与供应链","渠道管理","电商运营","客户关系管理","新媒体运营","产品运营"]
#total page count for each keyword, obtained in section 1.2
page_list=['1141', '62', '169', '619', '356', '61', '229', '64', '56', '356', '1379', '147', '62', '29', '2000', '173', '184', '10', '2', '396', '221', '115', '2000', '381', '5', '295', '1233', '280', '699', '352']
option = ChromeOptions()
UA="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36 Edg/94.0.992.31"
option.add_argument(f'user-agent={UA}')
option.add_experimental_option('useAutomationExtension', False)
option.add_experimental_option('excludeSwitches', ['enable-automation'])
web = webdriver.Chrome(options=option) # pass the configured ChromeOptions (chrome_options= is deprecated in newer Selenium)
web.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
"source": """
Object.defineProperty(navigator, 'webdriver', {
get: () => undefined
})
"""
})
web.implicitly_wait(10)
url='https://search.51job.com/list/000000,000000,0000,00,9,99,%E4%BA%A7%E5%93%81%E7%BB%8F%E7%90%86,2,2.html?'
web.get(url)
time.sleep(6)
le=len(job)
#connect to the database
db = pymysql.connect(
host="127.0.0.1", # MySQL server address
user="root", # user name
password="12345678", # password
database="python上机", # target database
charset="utf8" # so Chinese text is stored correctly
)
cursor = db.cursor()
for j in range(23, le): # adjust the start index (here 23) to resume from a given keyword; range(0, le) crawls all of them
for i in range(1, int(page_list[j]) + 1): # loop over every result page (pages are numbered from 1)
info = []
# url = "https://search.51job.com/list/000000,000000,0000,00,9,99," + j + ",2," + str(i) + ".html?"
url = "https://search.51job.com/list/000000,000000,0000,00,9,99,{},2,{}.html?".format(job[j], i)
web.get(url)
ht = web.page_source
jiexi(ht, info,job[j])
print('job {}: page {}/{}'.format(j, i, page_list[j])) # progress log
time.sleep(2)
save(info)
time.sleep(3)
cursor.close()
# close the database connection
db.close()
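Runs of this length are easily interrupted (hence the hard-coded start index 23 above), so it can help to wrap each page in a try/except so that a single failed page is logged and skipped instead of aborting the whole crawl. A sketch of how the inner loop could be hardened, reusing the names defined above:

# Hardened inner loop: a page that fails is logged and skipped rather than stopping the run
for i in range(1, int(page_list[j]) + 1):
    try:
        info = []
        url = "https://search.51job.com/list/000000,000000,0000,00,9,99,{},2,{}.html?".format(job[j], i)
        web.get(url)
        jiexi(web.page_source, info, job[j])
        save(info)
        print('job {}: page {}/{}'.format(j, i, page_list[j]))
    except Exception as e:
        print('skipped job {} page {}: {}'.format(j, i, e))
    time.sleep(2)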
1.5 Preview of the data scraped into the database
2. Data Cleaning
2.1 Design of the cleaning functions
#import the pymysql package
import pymysql
#connect to the MySQL database
db = pymysql.connect(
host="127.0.0.1",
user="root", password="12345678",
database="python上机", charset="utf8"
)
def pipei(): # keep only the rows whose job title actually matches the keyword they were scraped under
cursor = db.cursor() # get a cursor
cursor.execute("select * from jobs") # query every row from the jobs table
results = cursor.fetchall() # fetch all returned rows
after_pipei = [] # list that will hold the matched rows
for each_result in results: # each_result[0] is treated as the job title and each_result[-1] as the search keyword
if each_result[-1] == '物流与供应链':
if '物流' in each_result[0] or '供应链' in each_result[0]:
after_pipei.append(each_result)
elif each_result[-1] == '新媒体运营' or each_result[-1] == '电商运营':
if '运营' in each_result[0]:
after_pipei.append(each_result)
# Postings found under the keywords "电商运营" or "新媒体运营" often use platform-specific titles such as "拼多多运营" or "抖音运营", so for these two categories a title is counted as a match as long as it contains "运营".
elif each_result[-1] == '客户关系管理':
if '客户关系' in each_result[0]:
after_pipei.append(each_result)
elif each_result[-1] == '安卓开发':
if '安卓' in each_result[0] or 'Android' in each_result[0]:
after_pipei.append(each_result)
# Many companies advertise "安卓" roles under the English name "Android", so for the keyword "安卓开发" a title matches if it contains either "安卓" or "Android".
elif each_result[-1][:-2] in each_result[0] and each_result[-1][-2:] in each_result[0]:
after_pipei.append(each_result)
# For every remaining category both halves of the keyword must appear in the title, e.g. a title matches the keyword "数据分析" only if it contains both "数据" and "分析".
cursor.close() # close the cursor
return after_pipei # return the matched rows
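The matched rows returned by pipei() can then be written back to the database, for example into a separate table so that the raw data stays untouched. In the sketch below, jobs_cleaned is a hypothetical table assumed to have the same columns, in the same order, as jobs:

def save_cleaned(rows):
    # Insert the matched rows into jobs_cleaned (assumed to mirror the structure of jobs)
    if not rows:
        return
    cursor = db.cursor()
    placeholders = ",".join(["%s"] * len(rows[0]))
    cursor.executemany("insert into jobs_cleaned values ({})".format(placeholders),
                       [list(r) for r in rows])
    db.commit()
    cursor.close()

save_cleaned(pipei())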