Python Scrapy: Running Multiple Spiders Together
0. Goal
Our target is still the JD book bestseller chart for "Computers & Internet".
JD publishes several bestseller rankings for us to look at, so let's crawl the data for all of them:
Last 24 hours bestsellers
http://book.jd.com/booktop/3287-0-0-0-10001-1.html
Last 7 days bestsellers
http://book.jd.com/booktop/3287-0-0-0-10002-1.html
Last 30 days bestsellers
http://book.jd.com/booktop/3287-0-0-0-10003-1.html
1. Create the project
scrapy startproject jdbooksales
2. Generate a spider
cd jdbooksales
scrapy genspider booksales book.jd.com
This only scaffolds a template spider (booksales.py); in step 5 we create the three spiders we actually use, so the generated file can be deleted.
3. Define the item template (jdbooksales/jdbooksales/items.py)
# -*- coding: utf-8 -*-
import scrapy


class JdbooksalesItem(scrapy.Item):
    # the book title is the only field we collect
    name = scrapy.Field()
4. Pipelines (jdbooksales/jdbooksales/pipelines.py)
# -*- coding: utf-8 -*-
import codecs
import json


class Jdbooksales24Pipeline(object):
    # items from the 24-hour spider go to jd24.json
    def __init__(self):
        self.file = codecs.open('jd24.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        line = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(line)
        return item

    def close_spider(self, spider):
        self.file.close()


class Jdbooksales1Pipeline(object):
    # items from the 1-week spider go to jd1.json
    def __init__(self):
        self.file = codecs.open('jd1.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        line = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(line)
        return item

    def close_spider(self, spider):
        self.file.close()


class Jdbooksales30Pipeline(object):
    # items from the 30-day spider go to jd30.json
    def __init__(self):
        self.file = codecs.open('jd30.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        line = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(line)
        return item

    def close_spider(self, spider):
        self.file.close()
The three pipelines do exactly the same work; the only difference is which file each one writes to.
PS: this is worth tidying up when there's a chance; one possible way is sketched below.
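As a rough illustration of that cleanup, a single pipeline could choose its output file from the spider's name. This is only a sketch, not part of the project files above; the class name JsonPerSpiderPipeline and the name-to-file mapping are invented for the example:

# pipelines.py -- a single pipeline shared by all three spiders (sketch only)
import codecs
import json


class JsonPerSpiderPipeline(object):
    # hypothetical mapping from spider name to output file
    FILES = {'jd1': 'jd1.json', 'jd24': 'jd24.json', 'jd30': 'jd30.json'}

    def open_spider(self, spider):
        # open the file that belongs to whichever spider is running
        self.file = codecs.open(self.FILES[spider.name], 'w', encoding='utf-8')

    def process_item(self, item, spider):
        self.file.write(json.dumps(dict(item), ensure_ascii=False) + "\n")
        return item

    def close_spider(self, spider):
        self.file.close()

Each spider's ITEM_PIPELINES would then point at this one class instead of its own dedicated pipeline.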
5. Spiders
In the jdbooksales/jdbooksales/spiders directory, create three files:
jdbooksales1.py, jdbooksales24.py, jdbooksales30.py
These three spiders correspond to the three rankings we want to crawl.
jdbooksales1.py:
# -*- coding: utf-8 -*-
import scrapy
from jdbooksales.items import JdbooksalesItem


class Jd1Spider(scrapy.Spider):
    name = 'jd1'
    custom_settings = {
        'ITEM_PIPELINES': {'jdbooksales.pipelines.Jdbooksales1Pipeline': 500},
    }
    allowed_domains = ['book.jd.com']
    start_urls = ['http://book.jd.com/booktop/0-0-0.html?category=3287-0-0-0-10002-1']

    def parse(self, response):
        books = response.xpath('//div[@class="mc"]/ul[@class="clearfix"]/li')
        for book in books:
            item = JdbooksalesItem()
            # grab the book title
            item['name'] = book.xpath('./div[@class="p-detail"]/a/text()').extract()[0]
            yield item
        # follow the next page, if there is one
        next_url = response.xpath('//a[@class="pn-next"][1]/@href').extract_first()
        if next_url:
            next_url = response.urljoin(next_url)
            yield scrapy.Request(next_url, callback=self.parse)
jdbooksales24.py:
# -*- coding: utf-8 -*-
import scrapy
from jdbooksales.items import JdbooksalesItem


class Jd24Spider(scrapy.Spider):
    name = 'jd24'
    custom_settings = {
        'ITEM_PIPELINES': {'jdbooksales.pipelines.Jdbooksales24Pipeline': 400},
    }
    allowed_domains = ['book.jd.com']
    start_urls = ['http://book.jd.com/booktop/0-0-0.html?category=3287-0-0-0-10001-1']

    def parse(self, response):
        books = response.xpath('//div[@class="mc"]/ul[@class="clearfix"]/li')
        for book in books:
            item = JdbooksalesItem()
            # grab the book title
            item['name'] = book.xpath('./div[@class="p-detail"]/a/text()').extract()[0]
            yield item
        # follow the next page, if there is one
        next_url = response.xpath('//a[@class="pn-next"][1]/@href').extract_first()
        if next_url:
            next_url = response.urljoin(next_url)
            yield scrapy.Request(next_url, callback=self.parse)
jdbooksales30.py:
# -*- coding: utf-8 -*-
import scrapy
from jdbooksales.items import JdbooksalesItem


class Jd30Spider(scrapy.Spider):
    name = 'jd30'
    custom_settings = {
        'ITEM_PIPELINES': {'jdbooksales.pipelines.Jdbooksales30Pipeline': 300},
    }
    allowed_domains = ['book.jd.com']
    start_urls = ['http://book.jd.com/booktop/0-0-0.html?category=3287-0-0-0-10003-1']

    def parse(self, response):
        books = response.xpath('//div[@class="mc"]/ul[@class="clearfix"]/li')
        for book in books:
            item = JdbooksalesItem()
            # grab the book title
            item['name'] = book.xpath('./div[@class="p-detail"]/a/text()').extract()[0]
            yield item
        # follow the next page, if there is one
        next_url = response.xpath('//a[@class="pn-next"][1]/@href').extract_first()
        if next_url:
            next_url = response.urljoin(next_url)
            yield scrapy.Request(next_url, callback=self.parse)
Each spider uses its class-level custom_settings attribute to declare its own ITEM_PIPELINES, so there is no need to configure ITEM_PIPELINES in jdbooksales/jdbooksales/settings.py. A spider can be wired to one pipeline or to several.
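Before adding the combined command, each spider and its pipeline can be checked individually from the project root:

scrapy crawl jd1
scrapy crawl jd24
scrapy crawl jd30

Each run writes (or overwrites) the JSON file its pipeline is responsible for.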
6. Add a command that runs all the spiders
In the jdbooksales/jdbooksales/ directory, create a folder named commands.
Inside commands, create two files: __init__.py and crawlall.py.
__init__.py can stay empty; it just marks the directory as a Python package.
crawlall.py looks like this:
# -*- coding: utf-8 -*-
from scrapy.commands import ScrapyCommand
from scrapy.exceptions import UsageError
from scrapy.utils.conf import arglist_to_dict


class Command(ScrapyCommand):
    requires_project = True

    def syntax(self):
        return '[options]'

    def short_desc(self):
        return 'Runs all of the spiders'

    def add_options(self, parser):
        ScrapyCommand.add_options(self, parser)
        parser.add_option("-a", dest="spargs", action="append", default=[], metavar="NAME=VALUE",
                          help="set spider argument (may be repeated)")
        parser.add_option("-o", "--output", metavar="FILE",
                          help="dump scraped items into FILE (use - for stdout)")
        parser.add_option("-t", "--output-format", metavar="FORMAT",
                          help="format to use for dumping items with -o")

    def process_options(self, args, opts):
        ScrapyCommand.process_options(self, args, opts)
        try:
            opts.spargs = arglist_to_dict(opts.spargs)
        except ValueError:
            raise UsageError("Invalid -a value, use -a NAME=VALUE", print_help=False)

    def run(self, args, opts):
        # crawl the spiders named on the command line, or every spider in the project
        spider_loader = self.crawler_process.spider_loader
        for spidername in args or spider_loader.list():
            print("********* crawlall spider: " + spidername + " *********")
            self.crawler_process.crawl(spidername, **opts.spargs)
        self.crawler_process.start()
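A caveat about Scrapy versions (an assumption about your environment, not something this project requires): add_options/process_options above use the optparse-style parser.add_option API of older Scrapy releases. Recent Scrapy versions moved the command-line tool to argparse, in which case the options would be declared with parser.add_argument instead, roughly like this:

    def add_options(self, parser):
        ScrapyCommand.add_options(self, parser)
        # argparse-style equivalent of the -a option above (newer Scrapy)
        parser.add_argument("-a", dest="spargs", action="append", default=[],
                            metavar="NAME=VALUE",
                            help="set spider argument (may be repeated)")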
In the jdbooksales/jdbooksales/ directory, create a file named setup.py with the following content:
# -*- coding: utf-8 -*-
from setuptools import setup, find_packages

setup(
    name='scrapy-mymodule',
    entry_points={
        'scrapy.commands': [
            'crawlall=jdbooksales.commands:crawlall',
        ],
    },
)
Open jdbooksales/jdbooksales/settings.py and add this near the top:
COMMANDS_MODULE = 'jdbooksales.commands'
This setting is what registers the crawlall command inside the project; the setup.py entry point above only matters if you later want to package the command for reuse in other projects.
7. Settings to check before running
Disable the robots.txt check:
ROBOTSTXT_OBEY = False
Add a download delay (behave more like a human visitor and go easier on the server):
DOWNLOAD_DELAY = 3
Disable cookies if you don't need them:
COOKIES_ENABLED = False
The exact values depend on the site and situation you are dealing with.
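Putting these together with the COMMANDS_MODULE setting from step 6, the relevant part of jdbooksales/jdbooksales/settings.py ends up looking roughly like this sketch (lines not shown keep the defaults that scrapy startproject generated):

# jdbooksales/jdbooksales/settings.py (relevant lines only)
BOT_NAME = 'jdbooksales'

SPIDER_MODULES = ['jdbooksales.spiders']
NEWSPIDER_MODULE = 'jdbooksales.spiders'

# register the custom crawlall command
COMMANDS_MODULE = 'jdbooksales.commands'

# skip robots.txt checking
ROBOTSTXT_OBEY = False

# wait between requests to reduce load on the server
DOWNLOAD_DELAY = 3

# cookies are not needed for these list pages
COOKIES_ENABLED = False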
8. Run the command!
From the jdbooksales project root (the directory that contains scrapy.cfg), run:
scrapy crawlall
The files jd1.json, jd24.json and jd30.json will appear in that directory.
9. Bonus: another way to write crawlall.py:
# -*- coding: utf-8 -*-
from scrapy.commands import ScrapyCommand


class Command(ScrapyCommand):
    requires_project = True

    def syntax(self):
        return '[options]'

    def short_desc(self):
        return 'Runs all of the spiders'

    def run(self, args, opts):
        # queue every spider in the project, then start them all together
        spider_list = self.crawler_process.spider_loader.list()
        for name in spider_list:
            self.crawler_process.crawl(name, **opts.__dict__)
        self.crawler_process.start()