官方文档
The RetryMiddleware can be configured through the following settings (see the settings documentation for more info):
RETRY_ENABLED, RETRY_TIMES, RETRY_HTTP_CODES, RETRY_PRIORITY_ADJUST
源码(以版本V2.5为例)
class RetryMiddleware:
    """Middleware that re-issues requests which failed in a retryable way.

    A request is handed to ``get_retry_request`` when either:

    * its response status is one of the configured ``RETRY_HTTP_CODES``, or
    * processing raised one of ``EXCEPTIONS_TO_RETRY``.

    Requests whose ``meta['dont_retry']`` is truthy are never retried.
    ``meta['max_retry_times']`` and ``meta['priority_adjust']`` override the
    corresponding settings on a per-request basis.
    """

    # IOError is raised by the HttpCompression middleware when trying to
    # decompress an empty response
    EXCEPTIONS_TO_RETRY = (defer.TimeoutError, TimeoutError, DNSLookupError,
                           ConnectionRefusedError, ConnectionDone, ConnectError,
                           ConnectionLost, TCPTimedOutError, ResponseFailed,
                           IOError, TunnelError)

    def __init__(self, settings):
        """Read the retry configuration from *settings*.

        Raises:
            NotConfigured: if ``RETRY_ENABLED`` is false.
        """
        if not settings.getbool('RETRY_ENABLED'):
            raise NotConfigured
        self.max_retry_times = settings.getint('RETRY_TIMES')
        # Stored as a set of ints so the status lookup in process_response
        # is a plain membership test.
        self.retry_http_codes = {int(x) for x in settings.getlist('RETRY_HTTP_CODES')}
        self.priority_adjust = settings.getint('RETRY_PRIORITY_ADJUST')

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware from ``crawler.settings``."""
        return cls(crawler.settings)

    def process_response(self, request, response, spider):
        """Return a retry request for retryable statuses, else *response*."""
        if request.meta.get('dont_retry', False):
            return response
        if response.status in self.retry_http_codes:
            reason = response_status_message(response.status)
            # Fall back to the original response when _retry yields a falsy
            # value (presumably when retries are exhausted -- confirm against
            # get_retry_request).
            return self._retry(request, reason, spider) or response
        return response

    def process_exception(self, request, exception, spider):
        """Return a retry request for retryable exceptions, else ``None``."""
        if (
            isinstance(exception, self.EXCEPTIONS_TO_RETRY)
            and not request.meta.get('dont_retry', False)
        ):
            return self._retry(request, exception, spider)
        return None

    def _retry(self, request, reason, spider):
        """Delegate to ``get_retry_request`` using per-request overrides.

        ``request.meta`` values for ``max_retry_times`` and
        ``priority_adjust`` take precedence over the values read from the
        settings in ``__init__``.
        """
        max_retry_times = request.meta.get('max_retry_times', self.max_retry_times)
        priority_adjust = request.meta.get('priority_adjust', self.priority_adjust)
        return get_retry_request(
            request,
            reason=reason,
            spider=spider,
            max_retry_times=max_retry_times,
            priority_adjust=priority_adjust,
        )
解读
解读源码发现 RetryMiddleware 主要有以下功能:
- 出现网络、连接相关异常时重试;
- 出现指定的状态码< RETRY_HTTP_CODES >时重试;
最大重试次数为< RETRY_TIMES >;
且重试请求可以调整优先级< RETRY_PRIORITY_ADJUST >;
默认值为 -1,即重试请求的优先级默认低于原请求。