使用 Scrapy 爬取《平凡的世界》

武飞扬头像

暮晨清雪

2024-04-25 帮助1人

import scrapy
from ..items import PfdsjItem
class PfsjSpider(scrapy.Spider):
    """Crawl www.pingfandeshijie.net and yield one ``PfdsjItem`` per text page.

    The crawl is three levels deep:

    1. ``parse`` collects the links under the ``<h2>`` headings of the start
       page.
    2. ``two_parse`` collects the links in the list inside
       ``div.main`` of each of those pages.
    3. ``three_parse`` extracts the heading and paragraph text of each
       linked page and yields it as an item.
    """

    name = 'pfsj'
    # allowed_domains = ['xxx.com']
    start_urls = ['https://www.pingfandeshijie.net']

    def parse(self, response):
        """Yield a request for every catalogue link on the first-level page.

        Each request is handled by ``two_parse``.
        """
        hrefs = response.xpath("//center/table/tr/td/center/h2/a/@href").extract()
        for href in hrefs:
            # urljoin leaves absolute URLs untouched and resolves relative
            # ones against the page URL; scrapy.Request rejects URLs that
            # have no scheme.
            yield scrapy.Request(url=response.urljoin(href), callback=self.two_parse)

    def two_parse(self, respond):
        """Yield a request for every link in the second-level page's list.

        Each request is handled by ``three_parse``.
        """
        hrefs = respond.xpath('//div[@class="main"]/div[2]/ul/li/a/@href').extract()
        for href in hrefs:
            yield scrapy.Request(url=respond.urljoin(href), callback=self.three_parse)

    def three_parse(self, respond):
        """Extract heading and body text from a third-level page.

        The ``<h1>`` text is split on whitespace: the first token becomes
        ``part`` and the second token becomes ``page_num`` (falling back to
        the first token when the heading has only one). Paragraph texts are
        joined with newlines into ``content``.

        Pages without a usable heading are logged and skipped instead of
        raising inside the callback.
        """
        heading = respond.xpath('/html/body/div[3]/h1/text()').get()
        # .get() returns None when the XPath matches nothing, and a
        # whitespace-only heading splits into an empty list.
        tokens = heading.split() if heading else []
        if not tokens:
            self.logger.warning('No heading found on %s; page skipped', respond.url)
            return

        part = tokens[0]
        page_num = tokens[1] if len(tokens) > 1 else tokens[0]

        paragraphs = respond.xpath('//body/div[3]/div[2]/p/text()').extract()
        content = '\n'.join(paragraphs)

        # Populate the PfdsjItem fields.
        item = PfdsjItem()
        item['page_num'] = page_num
        item['part'] = part
        # Strip U+3000 (ideographic space). The previous pattern was the
        # literal text backslash-u-3-0-0, which never occurs in extracted
        # page text, so the call was a no-op.
        item['content'] = content.replace('\u3000', '')
        yield item

学新通

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import os
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
class PfdsjPipeline:
    """Append each item's text to ``<base_dir>/<part>/<page_num>.txt``."""

    # Root output directory. Kept as a class attribute so it can be
    # overridden on a subclass or instance; the default is unchanged.
    base_dir = './平凡的世界'

    def open_spider(self, spider):
        """Create the output root directory if it does not exist yet."""
        os.makedirs(self.base_dir, exist_ok=True)

    def process_item(self, item, spider):
        """Write ``item['content']`` to its file and pass the item on.

        Args:
            item: Mapping-like object providing ``part``, ``page_num`` and
                ``content`` string values.
            spider: The spider that produced the item (unused).

        Returns:
            The same ``item``, so that any later pipeline still receives it.
        """
        # One sub-directory per part; exist_ok avoids the check-then-create
        # race of a separate os.path.exists() test.
        part_dir = os.path.join(self.base_dir, item['part'])
        os.makedirs(part_dir, exist_ok=True)

        filename = os.path.join(part_dir, item['page_num'] + '.txt')
        # Append mode: items sharing the same part/page_num accumulate in
        # one file (and a re-run adds to existing files).
        with open(filename, 'a', encoding='utf-8') as f:
            f.write(item['content'])
        return item

学新通

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class PfdsjItem(scrapy.Item):
    """Fields carried by one scraped text page."""

    # Declared, but not assigned by the spider code shown in this file.
    book_name = scrapy.Field()
    # First whitespace-separated token of the page heading.
    part = scrapy.Field()
    # Second token of the page heading (or the first, if there is only one).
    page_num = scrapy.Field()
    # Paragraph texts of the page joined with newlines.
    content = scrapy.Field()

学新通

所需第三方库：Scrapy（itemadapter 会作为 Scrapy 的依赖一并安装）

运行结果：

学新通

学新通

这篇好文章是转载于：学新通技术网

版权申明：本站部分内容来自互联网，仅供学习及演示用，请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系，请提供相关证据及您的身份证明，我们将在收到邮件后48小时内删除。
本站站名：学新通技术网
本文地址： /boutique/detail/tanhgfiach

系列文章

同类精品

继续加载