• 首页 首页 icon
  • 工具库 工具库 icon
    • IP查询 IP查询 icon
  • 内容库 内容库 icon
    • 快讯库 快讯库 icon
    • 精品库 精品库 icon
    • 问答库 问答库 icon
  • 更多 更多 icon
    • 服务条款 服务条款 icon

Python-Scrapy 获取历史双色球开奖号码

武飞扬头像
羽丶千落
帮助1

Python-Scrapy 获取历史双色球开奖号码

1-创建项目

在终端中输入创建Scrapy项目的命令:

  scrapy startproject GetBicolorNumber

2-settings文件设置

   # Project settings (settings.py) used by this tutorial.
   ROBOTSTXT_OBEY = False  # do not apply robots.txt rules to this crawl
   DEFAULT_REQUEST_HEADERS = {
       'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
       'Accept-Language': 'en',
       # The HTTP header name is "User-Agent" (hyphen, not underscore);
       # with an underscore the browser UA string is never used as the UA.
       'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36',
   }  # default request headers

   ITEM_PIPELINES = {
       'GetBicolorNumber.pipelines.GetbicolornumberPipeline': 300,
   }  # required so scraped items are saved to file

   LOG_LEVEL = "WARNING"  # add this if you do not want routine log output

3-Item设置

Item设置,设置需要爬取的数据内容,items.py

    issue = scrapy.Field()  # draw issue number
    time = scrapy.Field()   # date/time of the draw
    numbers = scrapy.Field()    # winning numbers

4. 创建Spider

  • 创建一个Spider,终端上进入GetBicolorNumber/GetBicolorNumber/spiders
  • 输入scrapy genspider bicolor_number http://kaijiang.zhcw.com

5-爬取规则的编写

# -*- coding: utf-8 -*-
import scrapy
from ..items import GetbicolornumberItem
import time

class BicolorNumberSpider(scrapy.Spider):
    """Crawl the paginated double-colour-ball result lists on kaijiang.zhcw.com.

    Starts at ``list_1.html`` and follows ``list_<n>.html`` up to
    ``max_page``, yielding one ``GetbicolornumberItem`` per result row.
    """

    name = 'bicolor_number'
    # allowed_domains = ['kaijiang.zhcw.com']
    # Let responses with these status codes reach parse() instead of
    # being dropped before the callback.
    handle_httpstatus_list = [404, 500]
    start_urls = ['http://kaijiang.zhcw.com/zhcw/html/ssq/list_1.html']
    url = "http://kaijiang.zhcw.com/zhcw/html/ssq/list_{}.html"
    page = 1
    max_page = 145  # last list page to request

    def parse(self, response):
        """Yield one item per result row, then request the next list page.

        Args:
            response: The downloaded list page.

        Yields:
            ``GetbicolornumberItem`` objects with ``issue``, ``time`` and
            ``numbers`` set, followed by at most one ``scrapy.Request``
            for the next page.
        """
        print("Crawl 第:{}页".format(self.page))
        rows = response.xpath('/html/body/table//tr')  # result table rows
        # The first two rows and the last row are skipped; presumably
        # they are headers and the pager -- verify against the page.
        for row in rows[2:-1]:
            # A fresh item per row: re-using one instance would make every
            # yielded item alias the same (last-written) data.
            item = GetbicolornumberItem()
            item['issue'] = row.xpath("./td[1]/text()").extract_first()
            item['time'] = row.xpath("./td[2]/text()").extract_first()
            item['numbers'] = row.xpath("./td[3]//em/text()").extract()
            yield item

        self.page += 1
        if self.page <= self.max_page:
            next_page = self.url.format(self.page)
            # errback is invoked when processing the request raises.
            yield scrapy.Request(next_page, callback=self.parse,
                                 errback=self.after_404)

    def after_404(self, response):
        """Print the URL of a request that failed.

        Args:
            response: Scrapy passes a Failure to errbacks; it has no
                ``url`` attribute of its own, so the URL is taken from
                the request attached to it. A plain response object is
                still accepted.
        """
        url = getattr(response, 'url', None)
        if url is None:
            url = response.request.url
        print(url)

学新通

6-pipelines.py文件的编写

文件保存为projects.json。

import codecs
import json

class GetbicolornumberPipeline(object):
    """Append every scraped item to ``projects.json``, one JSON object per line."""

    def __init__(self):
        # 'w' truncates the output of any previous run.
        self.file = open('projects.json', 'w', encoding="utf-8")

    def process_item(self, item, spider):
        """Serialise *item* as one JSON line and pass it on unchanged.

        Args:
            item: The scraped item; anything ``dict()`` accepts.
            spider: The spider that produced the item (unused).

        Returns:
            The same *item*, so later pipelines can keep processing it.
        """
        # ensure_ascii=False keeps non-ASCII text readable in the file.
        data = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(data)
        return item

    def close_spider(self, spider):
        """Close the output file; Scrapy calls this hook when the spider closes."""
        self.file.close()

    # Backward-compatible alias for the previous method name, which
    # Scrapy never invoked on its own.
    spider_closed = close_spider

7-爬取

命令行进入项目的根目录,执行:scrapy crawl bicolor_number
建议:创建一个start.py文件,执行此文件即可

from scrapy import cmdline
# Run the spider with the same arguments as `scrapy crawl bicolor_number`.
cmdline.execute(["scrapy", "crawl", "bicolor_number"])

8-数据统计

获取历史蓝色球和红色球的出现次数

# -*- coding: utf-8 -*-
import json
import operator

def get_json(file_path):
    """Read the UTF-8 text file at *file_path* and return its lines.

    Args:
        file_path: Path of the file to read.

    Returns:
        A list with one string per line, line endings included.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return list(handle)

def get_numbersR_dict(number_list):
    """Count how often each red ball appears across all draws.

    Args:
        number_list: Iterable of draws. Each draw is a sequence of ball
            numbers; its last element is treated as the blue ball and
            is excluded from this count.

    Returns:
        A dict mapping each red ball number to its number of occurrences.
    """
    numbersR_dict = {}
    for numbers in number_list:
        for number in numbers[:-1]:
            # The first sighting counts as 1, every later one adds 1.
            numbersR_dict[number] = numbersR_dict.get(number, 0) + 1
    return numbersR_dict

def get_numbersB_dict(number_list):
    """Count how often each blue ball appears across all draws.

    Args:
        number_list: Iterable of draws. Each draw is a sequence of ball
            numbers whose last element is treated as the blue ball.

    Returns:
        A dict mapping each blue ball number to its number of occurrences.
    """
    numbersB_dict = {}
    for numbers in number_list:
        if not numbers:
            # A draw without any numbers has no blue ball to count.
            continue
        blue = numbers[-1]
        # The first sighting counts as 1, every later one adds 1.
        numbersB_dict[blue] = numbersB_dict.get(blue, 0) + 1
    return numbersB_dict

def sort_dictKey(numbers_dict, sort_key):
    """List the ``(key, value)`` pairs of *numbers_dict* in *sort_key* order.

    Args:
        numbers_dict: Mapping to read the values from.
        sort_key: Iterable of keys giving the output order; keys that
            are missing from *numbers_dict* are skipped.

    Returns:
        A list of ``(key, value)`` tuples.
    """
    return [
        (key, numbers_dict[key])
        for key in sort_key
        if key in numbers_dict
    ]


if __name__ == '__main__':
    # Report how often each red and blue ball occurs in the scraped draws.
    # The input is read as one JSON object per line with a 'numbers' list.
    file_path = r"E:\pyCharm\网络爬虫\test_scrapy\GetBicolorNumber\GetBicolorNumber\projects.json"
    json_list = get_json(file_path)

    # Blank lines (e.g. a trailing newline) are skipped so json.loads
    # is never handed an empty document.
    number_list = [
        json.loads(line)['numbers']
        for line in json_list
        if line.strip()
    ]

    print("总共有:{}期双色球数据数据".format(len(number_list)))
    numbersR_dict = get_numbersR_dict(number_list)
    numbersB_dict = get_numbersB_dict(number_list)

    # Sort both tallies by occurrence count, most frequent first.
    numbersR_v = sorted(numbersR_dict.items(), key=operator.itemgetter(1), reverse=True)
    numbersB_v = sorted(numbersB_dict.items(), key=operator.itemgetter(1), reverse=True)

    print("红色球出现统计数据:")
    for kv in numbersR_v:
        print(kv[0], ":", kv[1])

    print("蓝色球出现统计数据:")
    for kv in numbersB_v:
        print(kv[0], ":", kv[1])
        
学新通

这篇好文章是转载于:学新通技术网

  • 版权申明: 本站部分内容来自互联网,仅供学习及演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系,请提供相关证据及您的身份证明,我们将在收到邮件后48小时内删除。
  • 本站站名: 学新通技术网
  • 本文地址: /boutique/detail/tanhfigkbj
系列文章
更多 icon
同类精品
更多 icon
继续加载