Python Scrapy Multi-Page Practice
Goal
Building on 《Python Scrapy 单页实践》 (the single-page practice post), add multi-page traversal so the spider fetches data across all list pages.
Problem Encountered
Because the spider compares scraped text against a Chinese string ('下一页', meaning "next page"), you run into this warning:
UnicodeWarning: Unicode equal comparison failed to convert both arguments to Unicode - interpreting them as being unequal
The fix is to add the following:
import sys
reload(sys)
sys.setdefaultencoding('utf8')
I tested this snippet and it works wherever you put it, but the top of the file feels best, since there it has the least impact on the rest of the code.
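For context, both the warning and the fix are specific to Python 2, where a plain str holds bytes (reload and sys.setdefaultencoding are gone in Python 3, whose strings are already Unicode). A minimal sketch, with illustrative values, of the comparison that triggers it:

# -*- coding: utf-8 -*-
# Python 2 only: a str holding UTF-8 bytes compared with a unicode string.
byte_text = '下一页'   # type str: UTF-8 encoded bytes
uni_text = u'下一页'   # type unicode
# Emits the UnicodeWarning and evaluates to False, because Python 2
# attempts an implicit ASCII decode of byte_text and fails.
print(byte_text == uni_text)

import sys
reload(sys)
sys.setdefaultencoding('utf8')
# Now the implicit decode uses UTF-8, so the comparison succeeds.
print(byte_text == uni_text)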
Complete Code
Here is the code for manhua.py:
# -*- coding: utf-8 -*-
import scrapy
from manhua.items import ManhuaItem
import sys
reload(sys)
sys.setdefaultencoding('utf8')

class ManhuaSpider(scrapy.Spider):
    name = 'manhua'
    allowed_domains = ['manhua.dmzj.com']
    start_urls = ['https://manhua.dmzj.com/update_1.shtml']
    # reload(sys)
    # sys.setdefaultencoding('utf8')

    def parse(self, response):
        # reload(sys)
        # sys.setdefaultencoding('utf8')
        manhuas = response.xpath('//div[@class="boxdiv1"]')
        for manhua in manhuas:
            # print("----->>>>>>", manhua)
            item = ManhuaItem()
            # get the name
            item['name'] = manhua.xpath('./div[@class="picborder"]/a/@title').extract()[0]
            # get the link
            tempHref = manhua.xpath('./div[@class="picborder"]/a/@href').extract()[0]
            tempHrefArr = tempHref.split('/')
            # branch on the number of link segments
            if len(tempHrefArr) == 2:
                # not a guoman (domestic comic)
                item['uid'] = tempHrefArr[0]
                item['gm'] = 0
                # item['href'] = "https://manhua.dmzj.com/" + tempHrefArr[0]
            else:
                # a guoman (domestic comic)
                # item['href'] = tempHref
                item['gm'] = 1
                tempUid = tempHrefArr[len(tempHrefArr) - 1]
                tempUidArr = tempUid.split('.')
                if len(tempUidArr) == 2:
                    item['uid'] = tempUidArr[0]
                else:
                    item['uid'] = tempUid
            # get the update time
            test_time = manhua.css('div.pictext > ul > li.numfont > span::text')
            if test_time:
                item['time'] = test_time.extract()[0]
            else:
                item['time'] = manhua.css('div.pictext > ul > li.numfont ::text').extract()[0]
            yield item
        # get the next page
        next_text_7 = response.xpath('//div[@class="newpic_content"]/div[@class="pages"]/a[7]/text()').extract()[0]
        if next_text_7 == '下一页':
            next_url = response.xpath('//div[@class="newpic_content"]/div[@class="pages"]/a[7]/@href').extract_first()
            if next_url:
                next_url = response.urljoin(next_url)
                yield scrapy.Request(next_url, callback=self.parse)
            else:
                print('not 7')
        else:
            next_text_9 = response.xpath('//div[@class="newpic_content"]/div[@class="pages"]/a[9]/text()').extract()[0]
            if next_text_9 == '下一页':
                next_url = response.xpath('//div[@class="newpic_content"]/div[@class="pages"]/a[9]/@href').extract_first()
                if next_url:
                    next_url = response.urljoin(next_url)
                    yield scrapy.Request(next_url, callback=self.parse)
                else:
                    print('not 9')
            else:
                next_text_10 = response.xpath('//div[@class="newpic_content"]/div[@class="pages"]/a[10]/text()').extract()[0]
                if next_text_10 == '下一页':
                    next_url = response.xpath('//div[@class="newpic_content"]/div[@class="pages"]/a[10]/@href').extract_first()
                    if next_url:
                        next_url = response.urljoin(next_url)
                        yield scrapy.Request(next_url, callback=self.parse)
                    else:
                        print('not 10')
                else:
                    next_text_11 = response.xpath('//div[@class="newpic_content"]/div[@class="pages"]/a[11]/text()').extract()[0]
                    if next_text_11 == '下一页':
                        next_url = response.xpath('//div[@class="newpic_content"]/div[@class="pages"]/a[11]/@href').extract_first()
                        if next_url:
                            next_url = response.urljoin(next_url)
                            yield scrapy.Request(next_url, callback=self.parse)
                        else:
                            print('not 11')
                    else:
                        next_text_12 = response.xpath('//div[@class="newpic_content"]/div[@class="pages"]/a[12]/text()').extract()[0]
                        if next_text_12 == '下一页':
                            next_url = response.xpath('//div[@class="newpic_content"]/div[@class="pages"]/a[12]/@href').extract_first()
                            if next_url:
                                next_url = response.urljoin(next_url)
                                yield scrapy.Request(next_url, callback=self.parse)
                            else:
                                print('not 12')
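To make the uid/gm branching above easier to follow, here is a standalone sketch of the same split logic. The function name parse_href and the example href shapes are my own assumptions for illustration; the real hrefs come from the dmzj list page:

def parse_href(temp_href):
    # Mirrors the uid/gm logic inside parse(), for illustration only.
    parts = temp_href.split('/')
    if len(parts) == 2:
        # e.g. a relative link like '12345/' -> treated as non-guoman
        return {'uid': parts[0], 'gm': 0}
    # anything longer is treated as guoman; uid comes from the last
    # segment, with a '.shtml'-style suffix stripped when present
    temp_uid = parts[-1]
    uid_parts = temp_uid.split('.')
    uid = uid_parts[0] if len(uid_parts) == 2 else temp_uid
    return {'uid': uid, 'gm': 1}

print(parse_href('12345/'))            # {'uid': '12345', 'gm': 0}
print(parse_href('/news/6789.shtml'))  # {'uid': '6789', 'gm': 1}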
The next-page code feels poorly written; it just barely achieves the effect. Since I'm not very familiar with basic Python yet, I'll supply better code next time.
Updated Code for Getting the Next Page
# get the next page
next_url = ''
next_text_7 = response.xpath('//div[@class="newpic_content"]/div[@class="pages"]/a[7]/text()').extract()[0]
if next_text_7 == '下一页':
    next_url = response.xpath('//div[@class="newpic_content"]/div[@class="pages"]/a[7]/@href').extract_first()
else:
    next_text_8 = response.xpath('//div[@class="newpic_content"]/div[@class="pages"]/a[8]/text()').extract()[0]
    if next_text_8 == '下一页':
        next_url = response.xpath('//div[@class="newpic_content"]/div[@class="pages"]/a[8]/@href').extract_first()
    else:
        next_text_9 = response.xpath('//div[@class="newpic_content"]/div[@class="pages"]/a[9]/text()').extract()[0]
        if next_text_9 == '下一页':
            next_url = response.xpath('//div[@class="newpic_content"]/div[@class="pages"]/a[9]/@href').extract_first()
        else:
            next_text_10 = response.xpath('//div[@class="newpic_content"]/div[@class="pages"]/a[10]/text()').extract()[0]
            if next_text_10 == '下一页':
                next_url = response.xpath('//div[@class="newpic_content"]/div[@class="pages"]/a[10]/@href').extract_first()
            else:
                next_text_11 = response.xpath('//div[@class="newpic_content"]/div[@class="pages"]/a[11]/text()').extract()[0]
                if next_text_11 == '下一页':
                    next_url = response.xpath('//div[@class="newpic_content"]/div[@class="pages"]/a[11]/@href').extract_first()
                else:
                    next_text_12 = response.xpath('//div[@class="newpic_content"]/div[@class="pages"]/a[12]/text()').extract()[0]
                    if next_text_12 == '下一页':
                        next_url = response.xpath('//div[@class="newpic_content"]/div[@class="pages"]/a[12]/@href').extract_first()
                    else:
                        next_text_13 = response.xpath('//div[@class="newpic_content"]/div[@class="pages"]/a[13]/text()').extract()[0]
                        if next_text_13 == '下一页':
                            next_url = response.xpath('//div[@class="newpic_content"]/div[@class="pages"]/a[13]/@href').extract_first()
                        else:
                            next_url = ''
if len(next_url) > 0:
    next_url = response.urljoin(next_url)
    yield scrapy.Request(next_url, callback=self.parse)
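Worth noting before the final version: each else here immediately wraps another if, so the same cascade can be flattened with elif without changing behavior. A minimal sketch of that intermediate step, truncated for brevity (pages is just my local shorthand for the repeated XPath prefix):

pages = '//div[@class="newpic_content"]/div[@class="pages"]'
next_url = ''
if response.xpath(pages + '/a[7]/text()').extract()[0] == '下一页':
    next_url = response.xpath(pages + '/a[7]/@href').extract_first()
elif response.xpath(pages + '/a[8]/text()').extract()[0] == '下一页':
    next_url = response.xpath(pages + '/a[8]/@href').extract_first()
elif response.xpath(pages + '/a[9]/text()').extract()[0] == '下一页':
    next_url = response.xpath(pages + '/a[9]/@href').extract_first()
# ... continue with a[10] through a[13] in the same pattern
if next_url:
    next_url = response.urljoin(next_url)
    yield scrapy.Request(next_url, callback=self.parse)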
Just for Fun
# get the next page
next_url = ''
next_text_1 = response.xpath('//div[@class="newpic_content"]/div[@class="pages"]/a[1]/text()').extract()[0]
if next_text_1 == '下一页':
    next_url = response.xpath('//div[@class="newpic_content"]/div[@class="pages"]/a[1]/@href').extract_first()
else:
    next_text_2 = response.xpath('//div[@class="newpic_content"]/div[@class="pages"]/a[2]/text()').extract()[0]
    if next_text_2 == '下一页':
        next_url = response.xpath('//div[@class="newpic_content"]/div[@class="pages"]/a[2]/@href').extract_first()
    else:
        next_text_3 = response.xpath('//div[@class="newpic_content"]/div[@class="pages"]/a[3]/text()').extract()[0]
        if next_text_3 == '下一页':
            next_url = response.xpath('//div[@class="newpic_content"]/div[@class="pages"]/a[3]/@href').extract_first()
        else:
            next_text_4 = response.xpath('//div[@class="newpic_content"]/div[@class="pages"]/a[4]/text()').extract()[0]
            if next_text_4 == '下一页':
                next_url = response.xpath('//div[@class="newpic_content"]/div[@class="pages"]/a[4]/@href').extract_first()
            else:
                next_text_5 = response.xpath('//div[@class="newpic_content"]/div[@class="pages"]/a[5]/text()').extract()[0]
                if next_text_5 == '下一页':
                    next_url = response.xpath('//div[@class="newpic_content"]/div[@class="pages"]/a[5]/@href').extract_first()
                else:
                    next_text_6 = response.xpath('//div[@class="newpic_content"]/div[@class="pages"]/a[6]/text()').extract()[0]
                    if next_text_6 == '下一页':
                        next_url = response.xpath('//div[@class="newpic_content"]/div[@class="pages"]/a[6]/@href').extract_first()
                    else:
                        next_text_7 = response.xpath('//div[@class="newpic_content"]/div[@class="pages"]/a[7]/text()').extract()[0]
                        if next_text_7 == '下一页':
                            next_url = response.xpath('//div[@class="newpic_content"]/div[@class="pages"]/a[7]/@href').extract_first()
                        else:
                            next_text_8 = response.xpath('//div[@class="newpic_content"]/div[@class="pages"]/a[8]/text()').extract()[0]
                            if next_text_8 == '下一页':
                                next_url = response.xpath('//div[@class="newpic_content"]/div[@class="pages"]/a[8]/@href').extract_first()
                            else:
                                next_text_9 = response.xpath('//div[@class="newpic_content"]/div[@class="pages"]/a[9]/text()').extract()[0]
                                if next_text_9 == '下一页':
                                    next_url = response.xpath('//div[@class="newpic_content"]/div[@class="pages"]/a[9]/@href').extract_first()
                                else:
                                    next_text_10 = response.xpath('//div[@class="newpic_content"]/div[@class="pages"]/a[10]/text()').extract()[0]
                                    if next_text_10 == '下一页':
                                        next_url = response.xpath('//div[@class="newpic_content"]/div[@class="pages"]/a[10]/@href').extract_first()
                                    else:
                                        next_text_11 = response.xpath('//div[@class="newpic_content"]/div[@class="pages"]/a[11]/text()').extract()[0]
                                        if next_text_11 == '下一页':
                                            next_url = response.xpath('//div[@class="newpic_content"]/div[@class="pages"]/a[11]/@href').extract_first()
                                        else:
                                            next_text_12 = response.xpath('//div[@class="newpic_content"]/div[@class="pages"]/a[12]/text()').extract()[0]
                                            if next_text_12 == '下一页':
                                                next_url = response.xpath('//div[@class="newpic_content"]/div[@class="pages"]/a[12]/@href').extract_first()
                                            else:
                                                next_text_13 = response.xpath('//div[@class="newpic_content"]/div[@class="pages"]/a[13]/text()').extract()[0]
                                                if next_text_13 == '下一页':
                                                    next_url = response.xpath('//div[@class="newpic_content"]/div[@class="pages"]/a[13]/@href').extract_first()
                                                else:
                                                    next_url = ''
if len(next_url) > 0:
    next_url = response.urljoin(next_url)
    yield scrapy.Request(next_url, callback=self.parse)
The Final Code for Getting the Next Page
# get the next page
next_url = ''
pagesas = response.xpath('//div[@class="newpic_content"]/div[@class="pages"]/a')
for pagesa in pagesas:
    # extract_first() avoids an IndexError on links without text
    next_text = pagesa.xpath('./text()').extract_first()
    if next_text == '下一页':
        next_url = pagesa.xpath('./@href').extract_first()
        break
# truthiness check also covers a None from extract_first()
if next_url:
    next_url = response.urljoin(next_url)
    yield scrapy.Request(next_url, callback=self.parse)
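Going one step further, XPath can match the link by its text directly, which removes the Python-side loop entirely. A minimal sketch, assuming Scrapy 1.4+ so that response.follow is available (it also resolves relative URLs on its own); the u'' prefix keeps the non-ASCII XPath safe under Python 2:

next_href = response.xpath(
    u'//div[@class="newpic_content"]/div[@class="pages"]'
    u'/a[text()="下一页"]/@href').extract_first()
if next_href:
    # response.follow joins the relative href against the current page
    yield response.follow(next_href, callback=self.parse)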