The main work when writing a spider is the parse method of the Spider class; everything else is supporting code. To avoid copy-paste, I moved some shared attributes and methods into a base class, BaseSpider (no one has cancelled inheritance in Python), and that made life a lot easier. There is not much to describe at length here, so below is the source code of BaseSpider; a short usage sketch follows the listing:
import string
from datetime import datetime
from scrapy.spiders import Spider
from scrapy.http import Request
from scrapy import signals
from scrapy.spidermiddlewares.httperror import HttpError
from twisted.internet.error import DNSLookupError
from twisted.internet.error import TimeoutError, TCPTimedOutError
from scrapers.settings import BASE_DATA_PATH
from scrapers.utils import now
from scrapers.templates import SPIDER_INDEX_TEMPLATE
def add_timestamp_to_filename(filename, time_format='%Y%m%d%H%M'):
base, ext = filename.rsplit('.', 1)
timestamp = datetime.now().strftime(time_format)
return f"{base}_{timestamp}.{ext}"
class BaseSpider(Spider):
name = 'basespider'
    # Whether to build the descriptive index page for the spider
spider_index_html = False
title_page = ""
    # Template for the spider's descriptive page
desc_template = None
title = "<!---->"
description = "<!---->"
item_example = "<!---->"
    # Base name of the file where the spider's output data is stored
file_items_basename = "report"
    # File extension (format) of that file
file_items_ext = "json"
start_urls = [
"http://www.httpbin.org/",
]
def __init__(self, *args, **kwargs):
super(BaseSpider, self).__init__(*args, **kwargs)
self.item_count = 0
# ...
@classmethod
def from_crawler(cls, crawler, *args, **kwargs):
"""Настройка Feed Exports
"""
spider = super(BaseSpider, cls).from_crawler(crawler, *args, **kwargs) # noqa
spider.data_dir = BASE_DATA_PATH / spider.name
spider.data_file = spider.data_dir / f"{spider.file_items_basename}.{spider.file_items_ext}" # noqa
data_file = f"file:///{BASE_DATA_PATH}/{spider.name}/{spider.file_items_basename}.{spider.file_items_ext}" # noqa
spider.index_html = spider.data_dir / "index.html"
"""Для данных, которые отдает spider
"""
feeds = {
data_file: {
'format': 'json',
'encoding': 'utf8',
'store_empty': False,
'indent': 4,
'overwrite': True
},
}
"""Для архива данных spider, также через Feed Exports
"""
if spider.settings.get("COPY_TO_ARCHIVE", False):
_a = add_timestamp_to_filename(f"{spider.file_items_basename}.{spider.file_items_ext}") # noqa
data_file_archive = f"file:///{BASE_DATA_PATH}/archive/{spider.name}/{_a}" # noqa
feeds[data_file_archive] = {
'format': 'json',
'encoding': 'utf8',
'store_empty': False,
'indent': 4,
'overwrite': True
}
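        # "spider" priority lets these feeds override a FEEDS value from settings.py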
spider.settings.set("FEEDS", feeds, priority="spider")
crawler.signals.connect(spider.spider_opened, signal=signals.spider_opened) # noqa
crawler.signals.connect(spider.spider_closed, signal=signals.spider_closed) # noqa
crawler.signals.connect(spider.item_scraped, signal=signals.item_scraped) # noqa
return spider
def spider_opened(self, spider):
pass
def spider_closed(self, spider):
"""Добавление описательной части spider в index.html,
находящегося в папке spider
"""
if spider.spider_index_html:
a = string.Template(spider.desc_template)
content = a.safe_substitute(
title=spider.title,
description=spider.description,
example=spider.item_example,
file_items_basename=spider.file_items_basename,
file_items_ext=spider.file_items_ext,
last_update=now,
spider_name=spider.name,
total_items=self.item_count
)
t = string.Template(SPIDER_INDEX_TEMPLATE)
result = t.safe_substitute(
title_page=spider.title_page,
content=content
)
with open(spider.index_html, "w") as output:
output.write(result)
def item_scraped(self, item, response, spider):
self.item_count += 1
def start_requests(self):
for url in self.start_urls:
yield Request(
url=url,
callback=self.parse_success,
errback=self.parse_error,
)
def parse_success(self, response):
self.logger.info(
'Got successful response from {}'.format(response.url))
def parse_error(self, failure):
# log all failures
self.logger.error(repr(failure))
# in case you want to do something special for some errors,
# you may need the failure's type:
if failure.check(HttpError):
# these exceptions come from HttpError spider middleware
# you can get the non-200 response
response = failure.value.response
self.logger.error('HttpError on %s', response.url)
elif failure.check(DNSLookupError):
# this is the original request
request = failure.request
self.logger.error('DNSLookupError on %s', request.url)
elif failure.check(TimeoutError, TCPTimedOutError):
request = failure.request
self.logger.error('TimeoutError on %s', request.url)
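To show how the base class is meant to be used, here is a minimal sketch of a concrete spider built on top of BaseSpider. Since BaseSpider.start_requests routes responses to parse_success, that is the method a child spider overrides. The QuotesSpider name, the quotes.toscrape.com URL, the CSS selectors and the HTML template below are assumptions made for illustration, not part of the project:

class QuotesSpider(BaseSpider):
    # Hypothetical example spider: site, selectors and templates are
    # illustrative only.
    name = 'quotes'
    spider_index_html = True
    title_page = "Quotes"
    title = "Quotes"
    description = "Collects quotes and their authors from the demo site."
    item_example = '{"text": "...", "author": "..."}'
    desc_template = (
        "<h2>$title</h2>"
        "<p>$description</p>"
        "<p>Last update: $last_update, items: $total_items</p>"
    )
    file_items_basename = "quotes"
    start_urls = ["https://quotes.toscrape.com/"]

    def parse_success(self, response):
        # All spider-specific work happens here; BaseSpider already provides
        # start_requests, error handling, feed configuration and index.html.
        for quote in response.css("div.quote"):
            yield {
                "text": quote.css("span.text::text").get(),
                "author": quote.css("small.author::text").get(),
            }

With something like this in place, scrapy crawl quotes should write quotes.json into the spider's directory under BASE_DATA_PATH and, because spider_index_html is enabled, regenerate index.html next to it when the spider closes.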