Python Scrapy 多页实践

目标

在 《Python Scrapy 单页实践》基础上添加多页遍历获取数据。

遇见问题

其中由于有中文字符串的判断,会遇到这个问题:

UnicodeWarning: Unicode equal comparison failed to convert both arguments to Unicode - interpreting them as being unequal

解决方案是添加相应的内容:

import sys

reload(sys)
sys.setdefaultencoding('utf8')

测试过这段代码放哪里都行,感觉还是放头部最好,对代码的整体影响不大。(注意:`reload(sys)` 和 `sys.setdefaultencoding` 只在 Python 2 中可用,Python 3 已移除这两个接口。)

完整代码

下面是 manhua.py 的代码:

# -*- coding: utf-8 -*-
import scrapy
from manhua.items import ManhuaItem

import sys

# Python 2-only workaround: force the process-wide default encoding to
# UTF-8 so comparisons between byte strings and Chinese unicode literals
# stop raising UnicodeWarning. `reload(sys)` is needed because the
# interpreter deletes `sys.setdefaultencoding` after startup.
# NOTE(review): neither call exists in Python 3 — drop this when porting.
reload(sys)
sys.setdefaultencoding('utf8')


class ManhuaSpider(scrapy.Spider):
    """Spider that walks dmzj.com's comic-update listing page by page.

    For every comic card (``div.boxdiv1``) on a page it yields a
    ``ManhuaItem`` with:

    * ``name`` -- the comic title (link's ``title`` attribute)
    * ``uid``  -- the slug identifying the comic
    * ``gm``   -- 1 when the href is a full URL ("guoman"/domestic comic),
      0 when it is a relative ``<uid>/`` path
    * ``time`` -- the update-time text from the card

    Pagination follows the pager anchor whose text is '下一页' ("next
    page"), wherever it sits in the pager. This replaces the earlier
    copy-pasted probing of fixed positions a[7]/a[9]/a[10]/a[11]/a[12],
    which both duplicated code five times and raised IndexError via
    ``extract()[0]`` whenever the pager had fewer anchors.
    """

    name = 'manhua'
    allowed_domains = ['manhua.dmzj.com']
    start_urls = ['https://manhua.dmzj.com/update_1.shtml']

    def parse(self, response):
        """Yield one ManhuaItem per comic card, then a Request for the
        next listing page (if any)."""
        for manhua in response.xpath('//div[@class="boxdiv1"]'):
            item = ManhuaItem()

            # Comic title from the cover link's title attribute.
            # extract_first() returns None instead of raising when the
            # node is missing, unlike the previous extract()[0].
            item['name'] = manhua.xpath(
                './div[@class="picborder"]/a/@title').extract_first()

            # The href is either a relative "<uid>/" (regular comics) or
            # a full URL (domestic comics); distinguish by the number of
            # '/'-separated parts.
            temp_href = manhua.xpath(
                './div[@class="picborder"]/a/@href').extract_first(default='')
            parts = temp_href.split('/')
            if len(parts) == 2:
                # Regular comic: href looks like "<uid>/".
                item['uid'] = parts[0]
                item['gm'] = 0
            else:
                # Domestic comic: uid is the last path component, minus a
                # ".shtml"-style extension when present.
                item['gm'] = 1
                tail = parts[-1]
                tail_parts = tail.split('.')
                item['uid'] = tail_parts[0] if len(tail_parts) == 2 else tail

            # Update time: prefer the <span> text; fall back to any text
            # node inside li.numfont when the span is absent.
            time_sel = manhua.css('div.pictext > ul > li.numfont > span::text')
            if time_sel:
                item['time'] = time_sel.extract_first()
            else:
                item['time'] = manhua.css(
                    'div.pictext > ul > li.numfont ::text').extract_first()

            yield item

        # Follow the pager link labelled '下一页' ("next page"). Scanning
        # every anchor is robust to the pager growing/shrinking as the
        # current page moves through the listing.
        pager_links = response.xpath(
            '//div[@class="newpic_content"]/div[@class="pages"]/a')
        for anchor in pager_links:
            if anchor.xpath('./text()').extract_first() == '下一页':
                next_url = anchor.xpath('./@href').extract_first()
                if next_url:
                    yield scrapy.Request(response.urljoin(next_url),
                                         callback=self.parse)
                break


获取下一页的数据的代码,感觉写得很差,就是勉强实现了效果,毕竟对 Python 基础代码不太熟悉,下次补充更好的代码好了。

更新获取下一页的数据的代码

        # Fetch the next-page link.
        # NOTE(review): this version probes fixed pager positions
        # a[7]..a[13] one by one, looking for the anchor whose text is
        # '下一页' ("next page"). It still raises IndexError via
        # extract()[0] when an anchor at that position is absent — the
        # loop-based version at the end of the article fixes both issues.
        next_url = ''
        next_text_7 = response.xpath('//div[@class="newpic_content"]/div[@class="pages"]/a[7]/text()').extract()[0]
        if next_text_7 == '下一页':
            next_url = response.xpath('//div[@class="newpic_content"]/div[@class="pages"]/a[7]/@href').extract_first()
        else:
            next_text_8 = response.xpath('//div[@class="newpic_content"]/div[@class="pages"]/a[8]/text()').extract()[0]
            if next_text_8 == '下一页':
                next_url = response.xpath('//div[@class="newpic_content"]/div[@class="pages"]/a[8]/@href').extract_first()
            else:
                next_text_9 = response.xpath('//div[@class="newpic_content"]/div[@class="pages"]/a[9]/text()').extract()[0]
                if next_text_9 == '下一页':
                    next_url = response.xpath('//div[@class="newpic_content"]/div[@class="pages"]/a[9]/@href').extract_first()
                else:
                    next_text_10 = response.xpath('//div[@class="newpic_content"]/div[@class="pages"]/a[10]/text()').extract()[0]
                    if next_text_10 == '下一页':
                        next_url = response.xpath('//div[@class="newpic_content"]/div[@class="pages"]/a[10]/@href').extract_first()
                    else:
                        next_text_11 = response.xpath('//div[@class="newpic_content"]/div[@class="pages"]/a[11]/text()').extract()[0]
                        if next_text_11 == '下一页':
                            next_url = response.xpath('//div[@class="newpic_content"]/div[@class="pages"]/a[11]/@href').extract_first()
                        else:
                            next_text_12 = response.xpath('//div[@class="newpic_content"]/div[@class="pages"]/a[12]/text()').extract()[0]
                            if next_text_12 == '下一页':
                                next_url = response.xpath('//div[@class="newpic_content"]/div[@class="pages"]/a[12]/@href').extract_first()
                            else:
                                next_text_13 = response.xpath('//div[@class="newpic_content"]/div[@class="pages"]/a[13]/text()').extract()[0]
                                if next_text_13 == '下一页':
                                    next_url = response.xpath('//div[@class="newpic_content"]/div[@class="pages"]/a[13]/@href').extract_first()
                                else:
                                    next_url = ''

        # Only schedule the request when a next-page href was found.
        if len(next_url) > 0:
            next_url = response.urljoin(next_url)
            yield scrapy.Request(next_url, callback=self.parse)
        

闹着玩的

        # Fetch the next-page link.
        # NOTE(review): the "just for fun" version — the same fixed-index
        # probing extended to every pager position a[1]..a[13], nesting
        # thirteen levels deep. Kept for illustration only; the loop-based
        # version at the end of the article is the one to use.
        next_url = ''
        next_text_1 = response.xpath('//div[@class="newpic_content"]/div[@class="pages"]/a[1]/text()').extract()[0]
        if next_text_1 == '下一页':
            next_url = response.xpath('//div[@class="newpic_content"]/div[@class="pages"]/a[1]/@href').extract_first()
        else:
            next_text_2 = response.xpath('//div[@class="newpic_content"]/div[@class="pages"]/a[2]/text()').extract()[0]
            if next_text_2 == '下一页':
                next_url = response.xpath('//div[@class="newpic_content"]/div[@class="pages"]/a[2]/@href').extract_first()
            else:
                next_text_3 = response.xpath('//div[@class="newpic_content"]/div[@class="pages"]/a[3]/text()').extract()[0]
                if next_text_3 == '下一页':
                    next_url = response.xpath('//div[@class="newpic_content"]/div[@class="pages"]/a[3]/@href').extract_first()
                else:
                    next_text_4 = response.xpath('//div[@class="newpic_content"]/div[@class="pages"]/a[4]/text()').extract()[0]
                    if next_text_4 == '下一页':
                        next_url = response.xpath('//div[@class="newpic_content"]/div[@class="pages"]/a[4]/@href').extract_first()
                    else:
                        next_text_5 = response.xpath('//div[@class="newpic_content"]/div[@class="pages"]/a[5]/text()').extract()[0]
                        if next_text_5 == '下一页':
                            next_url = response.xpath('//div[@class="newpic_content"]/div[@class="pages"]/a[5]/@href').extract_first()
                        else:
                            next_text_6 = response.xpath('//div[@class="newpic_content"]/div[@class="pages"]/a[6]/text()').extract()[0]
                            if next_text_6 == '下一页':
                                next_url = response.xpath('//div[@class="newpic_content"]/div[@class="pages"]/a[6]/@href').extract_first()
                            else:
                                next_text_7 = response.xpath('//div[@class="newpic_content"]/div[@class="pages"]/a[7]/text()').extract()[0]
                                if next_text_7 == '下一页':
                                    next_url = response.xpath('//div[@class="newpic_content"]/div[@class="pages"]/a[7]/@href').extract_first()
                                else:
                                    next_text_8 = response.xpath('//div[@class="newpic_content"]/div[@class="pages"]/a[8]/text()').extract()[0]
                                    if next_text_8 == '下一页':
                                        next_url = response.xpath('//div[@class="newpic_content"]/div[@class="pages"]/a[8]/@href').extract_first()
                                    else:
                                        next_text_9 = response.xpath('//div[@class="newpic_content"]/div[@class="pages"]/a[9]/text()').extract()[0]
                                        if next_text_9 == '下一页':
                                            next_url = response.xpath('//div[@class="newpic_content"]/div[@class="pages"]/a[9]/@href').extract_first()
                                        else:
                                            next_text_10 = response.xpath('//div[@class="newpic_content"]/div[@class="pages"]/a[10]/text()').extract()[0]
                                            if next_text_10 == '下一页':
                                                next_url = response.xpath('//div[@class="newpic_content"]/div[@class="pages"]/a[10]/@href').extract_first()
                                            else:
                                                next_text_11 = response.xpath('//div[@class="newpic_content"]/div[@class="pages"]/a[11]/text()').extract()[0]
                                                if next_text_11 == '下一页':
                                                    next_url = response.xpath('//div[@class="newpic_content"]/div[@class="pages"]/a[11]/@href').extract_first()
                                                else:
                                                    next_text_12 = response.xpath('//div[@class="newpic_content"]/div[@class="pages"]/a[12]/text()').extract()[0]
                                                    if next_text_12 == '下一页':
                                                        next_url = response.xpath('//div[@class="newpic_content"]/div[@class="pages"]/a[12]/@href').extract_first()
                                                    else:
                                                        next_text_13 = response.xpath('//div[@class="newpic_content"]/div[@class="pages"]/a[13]/text()').extract()[0]
                                                        if next_text_13 == '下一页':
                                                            next_url = response.xpath('//div[@class="newpic_content"]/div[@class="pages"]/a[13]/@href').extract_first()
                                                        else:
                                                            next_url = ''

        # Only schedule the request when a next-page href was found.
        if len(next_url) > 0:
            next_url = response.urljoin(next_url)
            yield scrapy.Request(next_url, callback=self.parse)
        

最终获取下一页的数据

        # Fetch the next-page link.
        # Final version: scan every pager anchor and follow the one whose
        # text is '下一页' ("next page"), instead of probing fixed
        # positions. Robust to the pager growing/shrinking across pages.
        next_url = ''
        pagesas = response.xpath('//div[@class="newpic_content"]/div[@class="pages"]/a')
        for pagesa in pagesas:
            # NOTE(review): extract()[0] still raises IndexError if an
            # anchor has no direct text node; extract_first() would be
            # safer here — confirm against the live pager markup.
            next_text = pagesa.xpath('./text()').extract()[0]
            if next_text == '下一页':
                next_url = pagesa.xpath('./@href').extract_first()
                break

        # Only schedule the request when a next-page href was found.
        if len(next_url) > 0:
            next_url = response.urljoin(next_url)
            yield scrapy.Request(next_url, callback=self.parse)