Scrapy爬取拉勾网数据并存入本地文件和数据库
链接:https://pan.baidu.com/s/1yMM6DE-8RgVZjXEPHGimMQ 提取码:yyds
爬取代码: lg.py
-
import scrapy
-
from sjqx.items import SjqxItem
-
-
-
class Sjqxpider(scrapy.Spider):
    """Crawl Beijing Python job listings from lagou.com (pages 1-10).

    Yields one SjqxItem per posting with position, place, salary,
    requirement and company fields.  Requires a valid logged-in cookie
    (see __init__) because lagou.com gates search results behind login.
    """

    name = 'lg'
    allowed_domains = ['www.lagou.com']
    base_url = 'https://www.lagou.com/beijing-zhaopin/Python/'
    page = 1  # current listing page; pagination stops after page 10

    def __init__(self):
        super().__init__()
        # Browser-like headers so the site serves the normal HTML page
        # instead of an anti-bot response.
        self.headers = {
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "accept-encoding": "gzip, deflate, br",
            "accept-language": "zh-CN,zh;q=0.9",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36",
        }
        # Raw "Cookie" header copied from a logged-in browser session.
        # NOTE: these values expire quickly -- replace with your own before running.
        tmp = 'user_trace_token=20211116144103-285b4a8b-cf77-49af-bcc3-a1493b9ac075; _ga=GA1.2.1525634692.1637044883; LGUID=20211116144127-3b1b09bf-d6e5-4da7-b67f-2cdf37a54060; gate_login_token=8c6a5f90a68a54ae79afd8858424331b8c08483ffe6a9a3e0dc1f53e42c6a83b; LG_HAS_LOGIN=1; hasDeliver=0; privacyPolicyPopup=false; RECOMMEND_TIP=true; showExpriedIndex=1; showExpriedCompanyHome=1; showExpriedMyPublish=1; index_location_city=北京; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1637044880,1637111963,1637635397; _gat=1; LGSID=20211123104319-59137906-a5b8-46b6-89bd-6c190379ad6d; PRE_UTM=m_cf_cpt_baidu_pcbt; PRE_HOST=www.baidu.com; PRE_SITE=https://www.baidu.com/other.php?sc.K60000j28Sa2RWF0rhnSz-DlN8uZ4uFXSBtmAQWdjQRmJG_JVHlG3d7YYF4cPiMw0GNdeCsAuu3woK3mcji1sCIt8SpysS7U7E9zRlSaIlsODxV5yTUZJ_pnbFF-qEEuUdcIwUKcAvICIm0fBtvWtTsgXLSuscbiub3pkFmRuCAtAh2uxn_E4wdfNGcIDBr7hYWeNwW-2TzjHJuMHO06-IGdrshj.7Y_NR2Ar5Od663rj6tJQrGvKD77h24SU5WudF6ksswGuh9J4qt7jHzk8sHfGmYt_rE-9kYryqM764TTPqKi_nYQZHuukL0.TLFWgv-b5HDkrfK1ThPGujYknHb0THY0IAYqs2v4VnL30ZN1ugFxIZ-suHYs0A7bgLw4TARqnsKLULFb5TaV8UHPS0KzmLmqnfKdThkxpyfqnHR1nHD3n1fvn0KVINqGujYkPjRsPHbzr0KVgv-b5HDknH6vP1Td0AdYTAkxpyfqnHczP1n0TZuxpyfqn0KGuAnqiDFK0ZKGujYzPfKWpyfqnHbv0APzm1Y3Pjnz&ck=3145.1.83.248.192.244.184.381&dt=1637635396&wd=%E6%8B%89%E5%8B%BE%E7%BD%91&tpl=tpl_12273_25897_22126&l=1531183460&us=linkName%3D%25E6%25A0%2587%25E9%25A2%2598-%25E4%25B8%25BB%25E6%25A0%2587%25E9%25A2%2598%26linkText%3D%25E3%2580%2590%25E6%258B%2589%25E5%258B%25BE%25E6%258B%259B%25E8%2581%2598%25E3%2580%2591%25E5%25AE%2598%25E6%2596%25B9%25E7%25BD%2591%25E7%25AB%2599%2520-%2520%25E4%25BA%2592%25E8%2581%2594%25E7%25BD%2591%25E9%25AB%2598%25E8%2596%25AA%25E5%25A5%25BD%25E5%25B7%25A5%25E4%25BD%259C%25EF%25BC%258C%25E4%25B8%258A%25E6%258B%2589%25E5%258B%25BE!%26linkType%3D; PRE_LAND=https://www.lagou.com/landing-page/pc/search.html?utm_source=m_cf_cpt_baidu_pcbt; _putrc=29A9DFB8DFD1D0E6123F89F2B170EADC; JSESSIONID=ABAAABAABEIABCICF0A40B0EA71CD9CF3968BDA37BC6736; login=true; unick=用户8914; WEBTJ-ID=20211123104328-17d4aad40998ca-0db7a8ed41b3ec-978183a-1327104-17d4aad409a44c; X_HTTP_TOKEN=f3a43640e6ad551b01453673610993f4cb3ba8435e; __SAFETY_CLOSE_TIME__23081889=1; _gid=GA1.2.1853327180.1637635408; sensorsdata2015session={}; __lg_stoken__=1ccb553288981424baacaec2ab8e15417b46e5e0f9083c83d084ff93c43c14b1a6e72cae97b09bf5b6f4a122759e16b7415db744e058d09cc3210322104c46c4a074729c5f8c; SEARCH_ID=95c5b5e00e2a4bea983d5072fe38603c; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1637635442; LGRID=20211123104404-f48c9938-3fa6-4b3b-92e1-f85f21347fd9; sensorsdata2015jssdkcross={"distinct_id":"23081889","first_id":"17d2785705314e-0338c5cf8d6c37-57b1a33-1327104-17d27857054377","props":{"$latest_traffic_source_type":"直接流量","$latest_search_keyword":"未取到值_直接打开","$latest_referrer":"","$os":"Windows","$browser":"Chrome","$browser_version":"96.0.4664.45","lagou_company_id":""},"$device_id":"17d2785705314e-0338c5cf8d6c37-57b1a33-1327104-17d27857054377"}'
        # Turn "k=v; k=v; ..." into a dict.  maxsplit=1 is essential: cookie
        # values such as the sensorsdata JSON blobs contain '=' themselves,
        # and split("=")[-1] would keep only the text after the LAST '='.
        cookie_list = tmp.split("; ")
        self.cookies = {c.split("=", 1)[0]: c.split("=", 1)[-1] for c in cookie_list}
        self.url = 'https://www.lagou.com/beijing-zhaopin/Python/?filterOption=3&sid=2b73cbfd62d04652945f522f73392f3b'

    def parse(self, response):
        """Extract job fields from one listing page, then follow the next page."""
        li_list = response.xpath('//ul[@class ="item_con_list"]/li')
        for li in li_list:
            # extract_first(default='') keeps a missing node from crashing the
            # .replace() chain with AttributeError on None.
            position = li.xpath('.//div[@class="p_top"]//h3/text()').extract_first(default='').replace(" ", "")
            place = li.xpath('.//div[@class="p_top"]//a//span//em/text()').extract_first(default='').replace(" ", "")
            salary = li.xpath('.//div[@class="p_bot"]//span/text()').extract_first(default='').replace(" ", "")
            # Index 4 is the experience/education text node under p_bot
            # (layout-dependent; breaks if the site markup changes).
            requirement = li.xpath('.//div[@class="p_bot"]//text()').extract()[4].replace(" ", "").replace("\n", "").replace("/", ",")
            company = li.xpath('.//div[@class="company"]//div[@class="company_name"]/a/text()').extract_first(default='').replace(" ", "")
            yield SjqxItem(position=position, place=place, salary=salary,
                           requirement=requirement, company=company)
        # Follow pages 2..10; Scrapy's cookie middleware re-sends the session
        # cookies automatically, so only the URL is needed here.
        if self.page < 10:
            self.page += 1
            url = self.base_url + str(self.page) + '/?filterOption=3&sid=2b73cbfd62d04652945f522f73392f3b'
            yield scrapy.Request(url=url, callback=self.parse)

    def start_requests(self):
        """Kick off the crawl with the authenticated first-page request."""
        return [scrapy.Request(url=self.url, headers=self.headers,
                               cookies=self.cookies, callback=self.parse)]
存入本地和数据库代码: pipelines.py,并修改 settings 的管道(pipeline)配置
-
class SjqxPipeline:
    """Item pipeline that appends every scraped item to a local text file."""

    def open_spider(self, spider):
        # Runs once when the crawl starts: open the output file for the
        # whole spider lifetime (closed again in close_spider).
        self.f = open('sjqx.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # file.write() only accepts strings, so stringify the item first.
        self.f.write(str(item))
        # Hand the item on so later pipelines (e.g. the MySQL one) see it too.
        return item

    def close_spider(self, spider):
        # Runs once when the crawl ends: release the file handle.
        self.f.close()
-
-
import pymysql
-
class SjqxMysqlPipeline:
    """Item pipeline that inserts each scraped item into the MySQL table `sjqx`.

    The table must exist beforehand; adjust host/user/password/db below to
    match your environment.
    """

    def open_spider(self, spider):
        # Runs once at crawl start: open a single connection for the whole crawl.
        self.db = pymysql.connect(host='localhost', port=3306, user='root',
                                  passwd='123456', db='python', charset='utf8')
        self.cursor = self.db.cursor()

    def close_spider(self, spider):
        # Runs once at crawl end: release cursor, then connection.
        self.cursor.close()
        self.db.close()

    def process_item(self, item, spider):
        # Parameterized query: the driver escapes the values, so fields
        # containing quotes no longer break the statement, and SQL injection
        # through scraped text is impossible (the old "'%s'" % item[...]
        # string-building had both problems).
        sql = ("INSERT INTO `sjqx`(`place`,`company`,`position`,`salary`,`requirement`) "
               "VALUES (%s, %s, %s, %s, %s)")
        self.cursor.execute(sql, (item['place'], item['company'], item['position'],
                                  item['salary'], item['requirement']))
        self.db.commit()
        return item
运行命令:进入spider目录执行
scrapy crawl lg
注意:cookie最好自己复制;修改数据库地址,先创建好表格
这篇好文章是转载于:学新通技术网
- 版权申明: 本站部分内容来自互联网,仅供学习及演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系,请提供相关证据及您的身份证明,我们将在收到邮件后48小时内删除。
- 本站站名: 学新通技术网
- 本文地址: /boutique/detail/tanhgfiehc
系列文章
更多
同类精品
更多
-
photoshop保存的图片太大微信发不了怎么办
PHP中文网 06-15 -
《学习通》视频自动暂停处理方法
HelloWorld317 07-05 -
Android 11 保存文件到外部存储,并分享文件
Luke 10-12 -
word里面弄一个表格后上面的标题会跑到下面怎么办
PHP中文网 06-20 -
photoshop扩展功能面板显示灰色怎么办
PHP中文网 06-14 -
微信公众号没有声音提示怎么办
PHP中文网 03-31 -
excel下划线不显示怎么办
PHP中文网 06-23 -
excel打印预览压线压字怎么办
PHP中文网 06-22 -
TikTok加速器哪个好免费的TK加速器推荐
TK小达人 10-01 -
怎样阻止微信小程序自动打开
PHP中文网 06-13