Scrapy is one of the most widely used Python web scraping frameworks. This guide shows how to configure its proxy middleware for rotation, authentication, and reliable data extraction.
# settings.py
# Proxy credentials and endpoint. Note that Scrapy itself does not read
# HTTP_PROXY / HTTPS_PROXY settings: either export them as http_proxy /
# https_proxy environment variables (the built-in HttpProxyMiddleware picks
# those up) or set request.meta['proxy'] per request, as shown later.
HTTP_PROXY = 'http://your_username:your_password@proxy.proxies.sx:10001'
HTTPS_PROXY = 'http://your_username:your_password@proxy.proxies.sx:10001'
# Or use SOCKS5 (requires scrapy-socks)
# HTTP_PROXY = 'socks5://your_username:your_password@proxy.proxies.sx:10001'
# Enable the HttpProxyMiddleware
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 110,
}
# Concurrent requests (adjust based on your proxy plan)
CONCURRENT_REQUESTS = 16
CONCURRENT_REQUESTS_PER_DOMAIN = 8
# Download delay (be respectful)
DOWNLOAD_DELAY = 0.5
# Retry settings
RETRY_ENABLED = True
RETRY_TIMES = 3
RETRY_HTTP_CODES = [500, 502, 503, 504, 408, 429]
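If you simply want every request to go through one proxy, the built-in HttpProxyMiddleware (enabled by default) honors the standard proxy environment variables. Below is a minimal sketch of a run script that exports them before starting the crawl; the run.py file name, credentials, and proxy host are placeholders, and the same variables can just as well be exported in your shell before running scrapy crawl.

# run.py - sketch: route all requests through one proxy by exporting the
# standard proxy environment variables that Scrapy's built-in
# HttpProxyMiddleware reads.
import os

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

os.environ['http_proxy'] = 'http://your_username:your_password@proxy.proxies.sx:10001'
os.environ['https_proxy'] = 'http://your_username:your_password@proxy.proxies.sx:10001'

process = CrawlerProcess(get_project_settings())
process.crawl('myspider')  # spider name as defined in the project
process.start()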
# middlewares.py
import random


class RotatingProxyMiddleware:
    def __init__(self, proxy_list):
        self.proxy_list = proxy_list

    @classmethod
    def from_crawler(cls, crawler):
        username = crawler.settings.get('PROXY_USERNAME')
        password = crawler.settings.get('PROXY_PASSWORD')
        ports = crawler.settings.getlist('PROXY_PORTS')
        proxy_list = [
            f"http://{username}:{password}@proxy.proxies.sx:{port}"
            for port in ports
        ]
        return cls(proxy_list)

    def process_request(self, request, spider):
        # Pick a random proxy for every outgoing request
        proxy = random.choice(self.proxy_list)
        request.meta['proxy'] = proxy
        # Log only host:port so credentials stay out of the logs
        spider.logger.debug(f'Using proxy: {proxy.split("@")[1]}')
# settings.py - Add middleware
PROXY_USERNAME = 'your_username'
PROXY_PASSWORD = 'your_password'
PROXY_PORTS = [10001, 10002, 10003, 10004, 10005]
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.RotatingProxyMiddleware': 100,
}
# spiders/myspider.py
import scrapy


class MySpider(scrapy.Spider):
    name = 'myspider'

    def start_requests(self):
        urls = ['https://example.com/page1', 'https://example.com/page2']
        for url in urls:
            yield scrapy.Request(
                url=url,
                callback=self.parse,
                meta={
                    'proxy': 'http://user:pass@proxy.proxies.sx:10001'
                }
            )

    def parse(self, response):
        # Process response
        yield {
            'url': response.url,
            'title': response.css('title::text').get()
        }

# Install scrapy-socks: pip install scrapy-socks
# settings.py
DOWNLOADER_MIDDLEWARES = {
    'scrapy_socks.downloadermiddlewares.socks.SOCKSDownloaderMiddleware': 100,
}
# In spider or middleware
def start_requests(self):
    yield scrapy.Request(
        url='https://httpbin.org/ip',
        callback=self.parse,
        meta={
            'proxy': 'socks5://user:pass@proxy.proxies.sx:10001'
        }
    )

# pip install scrapy-rotating-proxies
# settings.py
DOWNLOADER_MIDDLEWARES = {
    'rotating_proxies.middlewares.RotatingProxyMiddleware': 610,
    'rotating_proxies.middlewares.BanDetectionMiddleware': 620,
}
ROTATING_PROXY_LIST = [
    'http://user:pass@proxy.proxies.sx:10001',
    'http://user:pass@proxy.proxies.sx:10002',
    'http://user:pass@proxy.proxies.sx:10003',
    'http://user:pass@proxy.proxies.sx:10004',
    'http://user:pass@proxy.proxies.sx:10005',
]
# How many times to retry with different proxies
ROTATING_PROXY_PAGE_RETRY_TIMES = 5
# Ban detection (optional)
# ROTATING_PROXY_BAN_POLICY = 'myproject.policy.MyBanPolicy'
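If the default ban detection misses site-specific signals, the commented-out ROTATING_PROXY_BAN_POLICY setting can point at your own policy class. Here is a sketch of what myproject/policy.py might look like, based on the response_is_ban / exception_is_ban hooks described in the scrapy-rotating-proxies README; verify the exact interface against your installed version.

# myproject/policy.py - sketch of a custom ban policy for scrapy-rotating-proxies
from rotating_proxies.policy import BanDetectionPolicy


class MyBanPolicy(BanDetectionPolicy):
    def response_is_ban(self, request, response):
        # Start from the package's default rules...
        ban = super().response_is_ban(request, response)
        # ...and also treat 403s and CAPTCHA pages as bans
        ban = ban or response.status == 403
        ban = ban or b'captcha' in response.body.lower()
        return ban

    def exception_is_ban(self, request, exception):
        # Keep the default handling for network-level errors
        return super().exception_is_ban(request, exception)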
# spiders/product_spider.py
import random

import scrapy


class ProductSpider(scrapy.Spider):
    name = 'products'
    allowed_domains = ['example.com']
    custom_settings = {
        'CONCURRENT_REQUESTS': 8,
        'DOWNLOAD_DELAY': 1,
        'RANDOMIZE_DOWNLOAD_DELAY': True,
    }

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.proxy_ports = [10001, 10002, 10003, 10004, 10005]
        self.proxy_base = 'http://user:pass@proxy.proxies.sx'

    def get_random_proxy(self):
        port = random.choice(self.proxy_ports)
        return f'{self.proxy_base}:{port}'

    def start_requests(self):
        urls = [
            'https://example.com/products/1',
            'https://example.com/products/2',
            'https://example.com/products/3',
        ]
        for url in urls:
            yield scrapy.Request(
                url=url,
                callback=self.parse,
                meta={
                    'proxy': self.get_random_proxy(),
                    'max_retry_times': 5
                },
                errback=self.handle_error
            )

    def parse(self, response):
        yield {
            'url': response.url,
            'title': response.css('h1::text').get(),
            'price': response.css('.price::text').get(),
            # Keep only host:port so credentials are not written to the feed
            'proxy_used': response.meta.get('proxy', '').split('@')[-1]
        }
        # Follow pagination
        next_page = response.css('a.next::attr(href)').get()
        if next_page:
            yield response.follow(
                next_page,
                callback=self.parse,
                meta={'proxy': self.get_random_proxy()}
            )

    def handle_error(self, failure):
        self.logger.error(f'Request failed: {failure.request.url}')

Implement proxy logic in middleware, not in spiders. This keeps spiders clean and makes configuration changes easy, as the sketch below shows.
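One concrete payoff: because rotation lives in RotatingProxyMiddleware, a spider can switch proxying off (or change how it is configured) purely through settings, without touching its crawling code. A sketch; the spider name and URL below are illustrative.

# spiders/direct_spider.py - hypothetical spider that disables the proxy
# middleware for itself via custom_settings
import scrapy


class DirectSpider(scrapy.Spider):
    name = 'direct'
    # Assigning None to a middleware disables it for this spider only
    custom_settings = {
        'DOWNLOADER_MIDDLEWARES': {
            'myproject.middlewares.RotatingProxyMiddleware': None,
        },
    }

    def start_requests(self):
        yield scrapy.Request('https://example.com/', callback=self.parse)

    def parse(self, response):
        yield {'title': response.css('title::text').get()}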
Detect bans (CAPTCHA pages, 403s) and rotate to a new proxy. scrapy-rotating-proxies handles this automatically.
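If you are not using scrapy-rotating-proxies, a small downloader middleware can approximate this: inspect each response, and when it looks like a ban, re-queue the request through a different proxy. The sketch below makes several assumptions: the class name, the ban heuristics, and the retry limit are illustrative, and it reuses the ROTATING_PROXY_LIST setting from above purely as a list of proxy URLs. Enable it in DOWNLOADER_MIDDLEWARES like the other middlewares shown earlier.

# middlewares.py - hand-rolled ban detection and proxy rotation (sketch)
import random


class BanRetryMiddleware:
    BAN_CODES = {403, 429, 503}
    MAX_PROXY_RETRIES = 3

    def __init__(self, proxy_list):
        self.proxy_list = proxy_list

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings.getlist('ROTATING_PROXY_LIST'))

    def process_response(self, request, response, spider):
        looks_banned = (
            response.status in self.BAN_CODES
            or b'captcha' in response.body.lower()
        )
        retries = request.meta.get('proxy_retry_times', 0)
        if looks_banned and retries < self.MAX_PROXY_RETRIES:
            spider.logger.info(f'Possible ban ({response.status}) on {request.url}, rotating proxy')
            # Returning a Request re-schedules it; dont_filter bypasses the dupe filter
            return request.replace(
                dont_filter=True,
                meta={
                    **request.meta,
                    'proxy': random.choice(self.proxy_list),
                    'proxy_retry_times': retries + 1,
                },
            )
        return response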
Even with mobile proxies, add delays between requests. Use DOWNLOAD_DELAY and RANDOMIZE_DOWNLOAD_DELAY.
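These are ordinary settings.py entries; the values below are illustrative, and the AutoThrottle lines enable an optional built-in extension that adapts the delay to observed server latency.

# settings.py - politeness / pacing
DOWNLOAD_DELAY = 1.0             # base delay between requests to the same site
RANDOMIZE_DOWNLOAD_DELAY = True  # actual wait is 0.5x to 1.5x DOWNLOAD_DELAY
# Optional: let Scrapy adjust the delay based on server response times
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 1.0
AUTOTHROTTLE_MAX_DELAY = 10.0
AUTOTHROTTLE_TARGET_CONCURRENCY = 4.0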
Track which proxies succeed or fail. Use this data to optimize your proxy allocation and detect issues.
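A lightweight way to do this is a downloader middleware that writes per-proxy counters into Scrapy's stats collector; the counters then appear in the stats summary Scrapy prints when the crawl finishes. A sketch, with the middleware name and stats-key layout as assumptions; enable it in DOWNLOADER_MIDDLEWARES alongside the proxy middleware.

# middlewares.py - per-proxy success/failure counters (sketch)
class ProxyStatsMiddleware:
    def __init__(self, stats):
        self.stats = stats

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.stats)

    @staticmethod
    def _proxy_key(request):
        proxy = request.meta.get('proxy', 'no_proxy')
        # Strip credentials so they never appear in logs or stats
        return proxy.split('@')[-1]

    def process_response(self, request, response, spider):
        outcome = 'ok' if response.status < 400 else f'http_{response.status}'
        self.stats.inc_value(f'proxies/{self._proxy_key(request)}/{outcome}')
        return response

    def process_exception(self, request, exception, spider):
        self.stats.inc_value(f'proxies/{self._proxy_key(request)}/error')
        # Returning None lets retry and other middlewares handle the exception
        return None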
Get mobile proxies for reliable large-scale web scraping.