Python Scrapy系列——爬取整个站点满足条件的url

/ ScrapyPython / 没有评论 / 64浏览

Python Scrapy爬取整个站点满足条件的url,从而根据这些url解析出想要的内容。

代码

# -*- coding: utf-8 -*-
from urllib.parse import urljoin

import scrapy
from scrapy.utils.response import get_base_url

from book.items import BookItem


class ScrapyChSpider(scrapy.Spider):
    """Crawl scrapy-chs.readthedocs.io, following every anchor link, and
    collect the URLs of pages whose address matches a required prefix."""

    name = 'scrapy_ch'
    allowed_domains = ['scrapy-chs.readthedocs.io']
    start_urls = ['http://scrapy-chs.readthedocs.io']

    # Only URLs starting with this prefix are emitted as items.
    need_url_prefix = 'http://scrapy-chs.readthedocs.io/zh_CN/1.0'

    def parse(self, response):
        """Yield a BookItem for pages matching the prefix, then follow links.

        Scrapy's built-in duplicate filter stops re-visits, and
        `allowed_domains` keeps the crawl on-site.
        """
        # Emit the current page when it satisfies the prefix condition.
        if response.url.startswith(self.need_url_prefix):
            item = BookItem()
            item['url'] = response.url
            yield item

        # Follow every <a href> on the page. response.urljoin resolves
        # relative hrefs against the page's base URL, replacing the manual
        # get_base_url + urljoin pair.
        for link in response.xpath("//a/@href").extract():
            url = response.urljoin(link)
            # Skip non-crawlable schemes (mailto:, javascript:, ftp:, ...)
            # which the original would have handed to the downloader.
            if not url.startswith(('http://', 'https://')):
                continue
            # Keep crawling; Scrapy dedupes already-seen URLs for us.
            yield scrapy.Request(url, callback=self.parse)

以上找到了想要的url。