
Scraping Lagou job listings with Scrapy and saving them to a local file and a database

By 天才少年137

Download link: https://pan.baidu.com/s/1yMM6DE-8RgVZjXEPHGimMQ (extraction code: yyds)

Spider code: lg.py

import scrapy

from sjqx.items import SjqxItem


class SjqxSpider(scrapy.Spider):
    name = 'lg'
    allowed_domains = ['www.lagou.com']

    base_url = 'https://www.lagou.com/beijing-zhaopin/Python/'
    page = 1

    def __init__(self):
        self.headers = {
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "accept-encoding": "gzip, deflate, br",
            "accept-language": "zh-CN,zh;q=0.9",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36",
        }

        # Cookie string copied from a logged-in browser session -- replace it with your own.
        tmp = 'user_trace_token=20211116144103-285b4a8b-cf77-49af-bcc3-a1493b9ac075; _ga=GA1.2.1525634692.1637044883; LGUID=20211116144127-3b1b09bf-d6e5-4da7-b67f-2cdf37a54060; gate_login_token=8c6a5f90a68a54ae79afd8858424331b8c08483ffe6a9a3e0dc1f53e42c6a83b; LG_HAS_LOGIN=1; hasDeliver=0; privacyPolicyPopup=false; RECOMMEND_TIP=true; showExpriedIndex=1; showExpriedCompanyHome=1; showExpriedMyPublish=1; index_location_city=北京; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1637044880,1637111963,1637635397; _gat=1; LGSID=20211123104319-59137906-a5b8-46b6-89bd-6c190379ad6d; PRE_UTM=m_cf_cpt_baidu_pcbt; PRE_HOST=www.baidu.com; PRE_SITE=https://www.baidu.com/other.php?sc.K60000j28Sa2RWF0rhnSz-DlN8uZ4uFXSBtmAQWdjQRmJG_JVHlG3d7YYF4cPiMw0GNdeCsAuu3woK3mcji1sCIt8SpysS7U7E9zRlSaIlsODxV5yTUZJ_pnbFF-qEEuUdcIwUKcAvICIm0fBtvWtTsgXLSuscbiub3pkFmRuCAtAh2uxn_E4wdfNGcIDBr7hYWeNwW-2TzjHJuMHO06-IGdrshj.7Y_NR2Ar5Od663rj6tJQrGvKD77h24SU5WudF6ksswGuh9J4qt7jHzk8sHfGmYt_rE-9kYryqM764TTPqKi_nYQZHuukL0.TLFWgv-b5HDkrfK1ThPGujYknHb0THY0IAYqs2v4VnL30ZN1ugFxIZ-suHYs0A7bgLw4TARqnsKLULFb5TaV8UHPS0KzmLmqnfKdThkxpyfqnHR1nHD3n1fvn0KVINqGujYkPjRsPHbzr0KVgv-b5HDknH6vP1Td0AdYTAkxpyfqnHczP1n0TZuxpyfqn0KGuAnqiDFK0ZKGujYzPfKWpyfqnHbv0APzm1Y3Pjnz&ck=3145.1.83.248.192.244.184.381&dt=1637635396&wd=%E6%8B%89%E5%8B%BE%E7%BD%91&tpl=tpl_12273_25897_22126&l=1531183460&us=linkName%3D%25E6%25A0%2587%25E9%25A2%2598-%25E4%25B8%25BB%25E6%25A0%2587%25E9%25A2%2598%26linkText%3D%25E3%2580%2590%25E6%258B%2589%25E5%258B%25BE%25E6%258B%259B%25E8%2581%2598%25E3%2580%2591%25E5%25AE%2598%25E6%2596%25B9%25E7%25BD%2591%25E7%25AB%2599%2520-%2520%25E4%25BA%2592%25E8%2581%2594%25E7%25BD%2591%25E9%25AB%2598%25E8%2596%25AA%25E5%25A5%25BD%25E5%25B7%25A5%25E4%25BD%259C%25EF%25BC%258C%25E4%25B8%258A%25E6%258B%2589%25E5%258B%25BE!%26linkType%3D; PRE_LAND=https://www.lagou.com/landing-page/pc/search.html?utm_source=m_cf_cpt_baidu_pcbt; _putrc=29A9DFB8DFD1D0E6123F89F2B170EADC; JSESSIONID=ABAAABAABEIABCICF0A40B0EA71CD9CF3968BDA37BC6736; login=true; unick=用户8914; WEBTJ-ID=20211123104328-17d4aad40998ca-0db7a8ed41b3ec-978183a-1327104-17d4aad409a44c; X_HTTP_TOKEN=f3a43640e6ad551b01453673610993f4cb3ba8435e; __SAFETY_CLOSE_TIME__23081889=1; _gid=GA1.2.1853327180.1637635408; sensorsdata2015session={}; __lg_stoken__=1ccb553288981424baacaec2ab8e15417b46e5e0f9083c83d084ff93c43c14b1a6e72cae97b09bf5b6f4a122759e16b7415db744e058d09cc3210322104c46c4a074729c5f8c; SEARCH_ID=95c5b5e00e2a4bea983d5072fe38603c; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1637635442; LGRID=20211123104404-f48c9938-3fa6-4b3b-92e1-f85f21347fd9; sensorsdata2015jssdkcross={"distinct_id":"23081889","first_id":"17d2785705314e-0338c5cf8d6c37-57b1a33-1327104-17d27857054377","props":{"$latest_traffic_source_type":"直接流量","$latest_search_keyword":"未取到值_直接打开","$latest_referrer":"","$os":"Windows","$browser":"Chrome","$browser_version":"96.0.4664.45","lagou_company_id":""},"$device_id":"17d2785705314e-0338c5cf8d6c37-57b1a33-1327104-17d27857054377"}'
        cookie_list = tmp.split("; ")
        # split on the first "=" only, because cookie values may themselves contain "="
        self.cookies = {cookie.split("=", 1)[0]: cookie.split("=", 1)[1] for cookie in cookie_list}
        self.url = 'https://www.lagou.com/beijing-zhaopin/Python/?filterOption=3&sid=2b73cbfd62d04652945f522f73392f3b'

    def parse(self, response):
        li_list = response.xpath('//ul[@class="item_con_list"]/li')
        for li in li_list:
            position = li.xpath('.//div[@class="p_top"]//h3/text()').extract_first().replace(" ", "")
            place = li.xpath('.//div[@class="p_top"]//a//span//em/text()').extract_first().replace(" ", "")
            salary = li.xpath('.//div[@class="p_bot"]//span/text()').extract_first().replace(" ", "")
            # the fifth text node under p_bot holds the experience/education requirement
            requirement = li.xpath('.//div[@class="p_bot"]//text()').extract()[4].replace(" ", "").replace("\n", "").replace("/", ",")
            company = li.xpath('.//div[@class="company"]//div[@class="company_name"]/a/text()').extract_first().replace(" ", "")
            sjqx = SjqxItem(position=position, place=place, salary=salary, requirement=requirement, company=company)
            yield sjqx
        if self.page < 10:
            self.page = self.page + 1
            url = self.base_url + str(self.page) + '/?filterOption=3&sid=2b73cbfd62d04652945f522f73392f3b'
            # Scrapy GET request for the next results page
            yield scrapy.Request(url=url, callback=self.parse)

    def start_requests(self):
        # the first request carries the login cookies; Scrapy's cookie middleware keeps them for later requests
        return [scrapy.Request(url=self.url, headers=self.headers, cookies=self.cookies, callback=self.parse)]
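lg.py imports SjqxItem from the project's items.py, which the post does not include. Below is a minimal sketch of what it would need to contain, assuming only the five fields the spider actually fills:

# items.py -- a minimal sketch (not shown in the original post);
# the five fields match exactly what lg.py's parse() populates
import scrapy


class SjqxItem(scrapy.Item):
    position = scrapy.Field()     # job title
    place = scrapy.Field()        # location / district
    salary = scrapy.Field()       # salary range
    requirement = scrapy.Field()  # experience and education requirement
    company = scrapy.Field()      # company name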

Code for saving to the local file and the database: pipelines.py (the pipelines must also be enabled in settings.py; a sketch of that configuration follows the pipeline code)

import json

import pymysql


class SjqxPipeline:
    # runs once, before the spider opens
    def open_spider(self, spider):
        self.f = open('sjqx.json', 'w', encoding='utf-8')

    # runs once, after the spider closes
    def close_spider(self, spider):
        self.f.close()

    # item is the object yielded by the spider
    def process_item(self, item, spider):
        # write() needs a string; serialize each item as one JSON line
        self.f.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item


class SjqxMysqlPipeline:

    # runs once, before the spider opens
    def open_spider(self, spider):
        self.db = pymysql.connect(host='localhost', port=3306, user='root', passwd='123456', db='python', charset='utf8')
        self.cursor = self.db.cursor()

    # runs once, after the spider closes
    def close_spider(self, spider):
        self.cursor.close()
        self.db.close()

    # item is the object yielded by the spider
    def process_item(self, item, spider):
        # parameterized query, so quotes inside scraped values cannot break the SQL
        sql = "INSERT INTO `sjqx`(`place`,`company`,`position`,`salary`,`requirement`) VALUES (%s, %s, %s, %s, %s)"
        self.cursor.execute(sql, (item['place'], item['company'], item['position'], item['salary'], item['requirement']))
        self.db.commit()
        return item
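The settings.py change mentioned above is not shown in the post. A minimal sketch, assuming the Scrapy project module is named sjqx (the priorities 300/301 are arbitrary; lower numbers run first):

# settings.py -- enable both pipelines (sketch; the module name `sjqx` is an assumption)
ITEM_PIPELINES = {
    'sjqx.pipelines.SjqxPipeline': 300,       # JSON file writer
    'sjqx.pipelines.SjqxMysqlPipeline': 301,  # MySQL writer
}
ROBOTSTXT_OBEY = False  # assumption: lagou.com's robots.txt would otherwise block the spider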

To run the spider, go into the Scrapy project directory and execute:

scrapy crawl lg

Note: copy your own cookie string into lg.py; update the database connection details, and create the table before running, for example as sketched below.
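The `sjqx` table must exist before the first insert. One way to create it, sketched with pymysql; only the column names are fixed by the pipeline's INSERT statement, while the types and lengths here are assumptions:

# create_table.py -- one-off table setup (sketch; column types and lengths are assumptions)
import pymysql

db = pymysql.connect(host='localhost', port=3306, user='root', passwd='123456', db='python', charset='utf8')
cursor = db.cursor()
cursor.execute("""
    CREATE TABLE IF NOT EXISTS `sjqx` (
        `id`          INT AUTO_INCREMENT PRIMARY KEY,
        `place`       VARCHAR(64),
        `company`     VARCHAR(128),
        `position`    VARCHAR(128),
        `salary`      VARCHAR(32),
        `requirement` VARCHAR(255)
    ) DEFAULT CHARSET = utf8
""")
db.commit()
db.close()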
