Scraping Free Proxies with Python

When scraping websites with Python, you can disguise the User-Agent through custom request headers and route traffic through high-anonymity proxy IPs to avoid being blocked. Below, we use rotating User-Agents to scrape free proxy lists.
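For instance, requests accepts both a custom User-Agent header and a proxy in a single call. A minimal sketch; the header string and proxy address below are placeholders, not working values:

import requests

# Placeholder User-Agent and proxy address -- substitute real values.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
proxies = {"http": "1.2.3.4:8080"}

response = requests.get("http://www.baidu.com", headers=headers,
                        proxies=proxies, timeout=3)
print(response.status_code)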

First, you need a list of User-Agents. I keep it in a module in the same directory and import it.
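The module itself isn't reproduced in this post (the full list is in the repo linked at the end). A minimal sketch of what user_agent_list.py could look like, assuming getheaders() returns a headers dict built from a randomly chosen User-Agent:

# user_agent_list.py -- minimal sketch; the real module holds a much
# longer list of User-Agent strings.
import random

USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 "
    "(KHTML, like Gecko) Version/14.0 Safari/605.1.15",
    "Mozilla/5.0 (X11; Linux x86_64; rv:88.0) Gecko/20100101 Firefox/88.0",
]


def getheaders():
    """Return a headers dict carrying a randomly chosen User-Agent."""
    return {"User-Agent": random.choice(USER_AGENTS)}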

import random
import re

import requests

import user_agent_list


class SpiderProxy():
    def __init__(self):
        self.url = ["https://www.kuaidaili.com/free/inha/1/",
                    "https://www.7yip.cn/free/?action=china&page=2",
                    "https://www.7yip.cn/free/?action=china&page=3", ]
        self.header = user_agent_list.getheaders()
        self.proxy = {}
        self.proxies_list = []
        self.run()

    def get_proxies_list(self, url):
        """Scrape one free-proxy page and collect its proxies."""
        try:
            response = requests.get(url, headers=self.header, timeout=3)
            response_data = response.content.decode('utf-8')

            # Both sites mark their table cells with data-title attributes.
            ip_list = re.findall(r'<td data-title="IP">(.*?)</td>', response_data, re.S)
            port_list = re.findall(r'<td data-title="PORT">(.*?)</td>', response_data, re.S)
            type_list = re.findall(r'<td data-title="类型">(.*?)</td>', response_data, re.S)

            # Accumulate across pages; the list is initialised in __init__.
            for index in range(len(ip_list)):
                # Lower-case the scheme ("HTTP" -> "http") so requests can
                # match the proxy key against the request URL's scheme.
                self.proxies_list.append("{'%s':'%s:%s'}" % (
                    type_list[index].lower(), ip_list[index], port_list[index]))

        except Exception as e:
            print(e)

        # Optional eager validation of every scraped proxy:
        # for tmp_proxy in tmp_list:
        #     proxy = eval(tmp_proxy)
        #     response = requests.get("http://www.baidu.com", headers=self.header, proxies=proxy)
        #     if response.status_code == 200:
        #         self.proxies_list.append(tmp_proxy)

    def get_proxy(self):
        """Pick random proxies until one answers within the timeout."""
        while 1:
            self.proxy = eval(random.choice(self.proxies_list))
            try:
                response = requests.get("http://www.baidu.com", headers=self.header,
                                        proxies=self.proxy, timeout=3)
                if response.status_code == 200:
                    return
            except Exception as e:
                print("getProxy ------> ERROR", e)

    def run(self):
        for url in self.url:
            self.get_proxies_list(url)
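
Putting it together, a minimal usage sketch continuing from the class above (the constructor scrapes the pages, and get_proxy() validates candidates against Baidu):

spider = SpiderProxy()      # scrapes all three pages on construction
spider.get_proxy()          # loops until some proxy answers within 3 s
print(spider.proxy)         # e.g. {'http': '1.2.3.4:8080'}

# Reuse the validated proxy for follow-up requests.
response = requests.get("http://www.baidu.com",
                        headers=spider.header, proxies=spider.proxy)

Storing the proxies as string literals and eval-ing them later mirrors the original code; appending plain dicts to proxies_list would avoid the eval entirely and is the safer design.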

Project repo: Python-Spider/爬取代理 at master · Mug-9/Python-Spider (github.com)

---------Thanks for your attention---------