1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
class RandomUserAgent(object):
    """Downloader middleware that attaches a randomly chosen User-Agent
    header to each outgoing request.

    The pool of agent strings comes from the ``USER_AGENTS`` setting.
    """

    def __init__(self, agents):
        # Pool of user-agent strings to draw from on each request.
        self.agents = agents

    @classmethod
    def from_crawler(cls, crawler):
        """Alternate constructor: build the middleware from crawler settings."""
        return cls(crawler.settings.getlist('USER_AGENTS'))

    def process_request(self, request, spider):
        # setdefault keeps any User-Agent a spider set explicitly;
        # only requests without one get a random agent.
        chosen = random.choice(self.agents)
        request.headers.setdefault('User-Agent', chosen)
class ProxyMiddleware(object):
    """Downloader middleware that routes each request through a proxy
    picked at random from the 10 most recently verified rows of the
    ``xicidaili`` MySQL table.

    Connection parameters come from the ``MYSQL_*`` crawler settings.
    """

    def __init__(self, mysql_host, mysql_db, mysql_user, mysql_passwd):
        self.mysql_host = mysql_host
        self.mysql_db = mysql_db
        self.mysql_user = mysql_user
        self.mysql_passwd = mysql_passwd

    @classmethod
    def from_crawler(cls, crawler):
        """Alternate constructor: build the middleware from crawler settings."""
        return cls(
            mysql_host=crawler.settings.get('MYSQL_HOST'),
            mysql_user=crawler.settings.get('MYSQL_USER'),
            mysql_passwd=crawler.settings.get('MYSQL_PASSWD'),
            mysql_db=crawler.settings.get('MYSQL_DB')
        )

    def process_request(self, request, spider):
        """Attach ``request.meta['proxy']`` (and, when credentials exist,
        a ``Proxy-Authorization`` header) from a randomly chosen proxy row.

        On any database failure the request is left untouched and goes
        out without a proxy.
        """
        try:
            conn = MySQLdb.connect(
                user=self.mysql_user,
                passwd=self.mysql_passwd,
                db=self.mysql_db,
                host=self.mysql_host,
                charset="utf8",
                use_unicode=True
            )
        except MySQLdb.Error as e:
            # BUG FIX: the original fell through after a failed connect and
            # dereferenced an undefined cursor (AttributeError). Log and
            # let the request proceed unproxied instead.
            print("Mysql Error %d: %s" % (e.args[0], e.args[1]))
            return
        try:
            cursor = conn.cursor()
            cursor.execute(
                'SELECT * FROM xicidaili order by verified_time DESC Limit 0,10')
            proxy_rows = cursor.fetchall()
        finally:
            # BUG FIX: the original opened a new connection per request and
            # never closed it — a connection leak under sustained crawling.
            conn.close()
        if not proxy_rows:
            # No proxies available: send the request directly.
            return
        proxy = random.choice(proxy_rows)
        # Row layout (by position): [1]=ip, [2]=port,
        # [4]="user:password" credentials (NULL when the proxy is open),
        # [6]=scheme, e.g. "HTTP"/"HTTPS" — TODO confirm against the
        # xicidaili table schema.
        ip = proxy[1]
        port = proxy[2]
        user_pass = proxy[4]
        http_method = proxy[6].lower()
        request.meta['proxy'] = "%s://%s:%s" % (http_method, ip, port)
        if user_pass is not None:
            # BUG FIX: base64.encodestring appends a trailing newline,
            # which corrupts the header value; b64encode does not.
            encoded_user_pass = base64.b64encode(user_pass)
            request.headers['Proxy-Authorization'] = 'Basic ' + encoded_user_pass
|