使用scrapy爬取平凡的世界
-
import scrapy
-
from ..items import PfdsjItem
-
-
-
class PfsjSpider(scrapy.Spider):
    """Crawl the full text of the novel "平凡的世界" (Ordinary World)
    from pingfandeshijie.net.

    Three link levels are followed:
      1. parse       -- front page    -> links to each part of the novel
      2. two_parse   -- part index    -> links to each chapter
      3. three_parse -- chapter page  -> yields a populated PfdsjItem
    """

    name = 'pfsj'
    # allowed_domains = ['xxx.com']
    start_urls = ['https://www.pingfandeshijie.net']

    def parse(self, response):
        """Level 1: extract the part links from the front-page table."""
        part_links = response.xpath(
            "//center/table/tr/td/center/h2/a/@href").extract()
        for link in part_links:
            # Hand each part link to the scheduler; responses are
            # handled by two_parse.
            yield scrapy.Request(url=link, callback=self.two_parse)

    def two_parse(self, respond):
        """Level 2: extract the chapter links from a part's index page."""
        chapter_links = respond.xpath(
            '//div[@class="main"]/div[2]/ul/li/a/@href').extract()
        for link in chapter_links:
            # Each chapter page is parsed by three_parse.
            yield scrapy.Request(url=link, callback=self.three_parse)

    def three_parse(self, respond):
        """Level 3: scrape one chapter page and yield a PfdsjItem with
        the part name, chapter heading and chapter text."""
        # The <h1> heading holds e.g. "第一部 第一章"; split on whitespace.
        title_words = respond.xpath('/html/body/div[3]/h1/text()').get().split()
        part = title_words[0]
        # Some pages have a one-word heading; reuse it as the chapter name.
        page_num = title_words[1] if len(title_words) > 1 else title_words[0]

        paragraphs = respond.xpath('//body/div[3]/div[2]/p/text()').extract()
        content = '\n'.join(paragraphs)

        item = PfdsjItem()
        item['page_num'] = page_num
        item['part'] = part
        # BUGFIX: the original used '\\u300' (a literal backslash-u-3-0-0,
        # which can never appear in extracted text).  The evident intent
        # is to strip U+3000 ideographic spaces used for paragraph
        # indentation on Chinese pages.
        item['content'] = content.replace('\u3000', '')
        yield item
-
# Define your item pipelines here
-
#
-
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
-
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
-
-
import os
-
# useful for handling different item types with a single interface
-
from itemadapter import ItemAdapter
-
-
-
class PfdsjPipeline:
    """Pipeline that writes each scraped chapter to
    ./平凡的世界/<part>/<page_num>.txt under the current directory."""

    def open_spider(self, spider):
        """Called once when the spider starts: create the root output dir."""
        dirname = './平凡的世界'
        if not os.path.exists(dirname):
            os.mkdir(dirname)

    def process_item(self, item, spider):
        """Append one chapter's text to its part's directory.

        Returns the item so downstream pipelines keep receiving it.
        """
        # BUGFIX: the original line was a syntax error
        # ("'./%s/'%('平凡的世界') item['part']"); the intent is the
        # per-part subdirectory below the root created in open_spider.
        dirname = './%s/%s' % ('平凡的世界', item['part'])
        if not os.path.exists(dirname):
            os.mkdir(dirname)
        # File path: <root>/<part>/<chapter heading>
        filename = "./%s/%s/%s" % ('平凡的世界', item['part'], item['page_num'])
        # BUGFIX: the original "open(filename '.txt', ...)" was missing
        # the '+' concatenation operator.
        with open(filename + '.txt', 'a', encoding='utf-8') as f:
            f.write(item['content'])
        # Scrapy convention: return the item for later pipelines.
        return item
-
# Define here the models for your scraped items
-
#
-
# See documentation in:
-
# https://docs.scrapy.org/en/latest/topics/items.html
-
-
import scrapy
-
-
-
class PfdsjItem(scrapy.Item):
    """Container for one scraped chapter of the novel."""

    # define the fields for your item here like:
    # name = scrapy.Field()
    book_name = scrapy.Field()  # title of the book
    part = scrapy.Field()       # which part of the novel the chapter belongs to
    page_num = scrapy.Field()   # chapter heading / number
    content = scrapy.Field()    # full chapter text
所需第三方库:scrapy库
运行结果:
这篇好文章是转载于:学新通技术网
- 版权申明: 本站部分内容来自互联网,仅供学习及演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系,请提供相关证据及您的身份证明,我们将在收到邮件后48小时内删除。
- 本站站名: 学新通技术网
- 本文地址: /boutique/detail/tanhgfiach
系列文章
更多
同类精品
更多
-
photoshop保存的图片太大微信发不了怎么办
PHP中文网 06-15 -
《学习通》视频自动暂停处理方法
HelloWorld317 07-05 -
Android 11 保存文件到外部存储,并分享文件
Luke 10-12 -
word里面弄一个表格后上面的标题会跑到下面怎么办
PHP中文网 06-20 -
photoshop扩展功能面板显示灰色怎么办
PHP中文网 06-14 -
微信公众号没有声音提示怎么办
PHP中文网 03-31 -
excel下划线不显示怎么办
PHP中文网 06-23 -
excel打印预览压线压字怎么办
PHP中文网 06-22 -
怎样阻止微信小程序自动打开
PHP中文网 06-13 -
TikTok加速器哪个好免费的TK加速器推荐
TK小达人 10-01