python 抓取代理IP并多线程验证可用性
本程序的数据来自开源项目 fate0/proxylist(该项目的数据每 15 分钟更新一次)
运行流程:
- 抓取https://raw.githubusercontent.com/fate0/proxylist/master/proxy.list
- 多线程验证可用性
- 将有用的IP信息(IP地址及端口号)进行存储,写入TXT文本
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import requests
import json
import threading
def downloadProxylist(timeout=30):
    """Download the proxy list file and return its content split into lines.

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on a stalled
            connection.

    Returns:
        A list of strings obtained by splitting the response body on "\\n".
        The list may contain empty strings (e.g. after a trailing newline).

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    r = requests.get(
        'https://raw.githubusercontent.com/fate0/proxylist/master/proxy.list',
        timeout=timeout,
    )
    # Fail loudly on 4xx/5xx instead of handing an error page to the caller
    # as if it were proxy data.
    r.raise_for_status()
    return r.text.split("\n")
# Accumulated text of all proxies that passed the check, one quoted
# "host:port" entry per line.
ipText = ''
# Guards updates to ipText: testIP runs in many threads at once and
# ``ipText += ...`` is a read-modify-write that is not atomic.
_ipTextLock = threading.Lock()


def testIP(a):
    """Check whether a proxy works and record it in ``ipText`` if it does.

    The proxy is used to fetch http://www.baidu.com/; a response with status
    code 200 counts as working. The outcome is printed either way.

    Args:
        a: Proxy mapping in the form accepted by ``requests``; its 'http'
            entry is the "host:port" string that gets printed and stored.
    """
    global ipText
    try:
        r = requests.get('http://www.baidu.com/', proxies=a, timeout=30)
    except Exception:
        # Best-effort check: any failure to get a response just marks this
        # proxy as unusable. KeyboardInterrupt/SystemExit are not swallowed.
        print(a['http'] + ' - error')
        return
    print(a['http'] + ' - ' + str(r.status_code))
    if r.status_code == 200:
        # Prefix/suffix around each entry; the default output looks like
        #  '192.168.1.1:80',
        prefix = " '"
        postfix = "',\r\n"
        with _ipTextLock:
            ipText += prefix + a['http'] + postfix
# Proxy mappings ({"http": "host:port"}) built from the downloaded list.
proxys = []


def main(output_path='/home/liumingye/ip.txt'):
    """Download the proxy list, test every proxy in its own thread, save results.

    Args:
        output_path: File that receives the working proxies. Change the
            default to the location where you want the result stored.
    """
    # Download the proxy list.
    data = downloadProxylist()

    # Build the proxy mappings; each non-blank line is parsed as a JSON
    # object with 'host' and 'port' entries.
    for item in data:
        item = item.strip()
        if not item:
            continue
        try:
            a = json.loads(item)
            proxy_temp = {"http": a['host'] + ':' + str(a['port'])}
        except (ValueError, KeyError, TypeError):
            # One malformed line should not abort the whole run.
            print('skipped invalid line: ' + item)
            continue
        proxys.append(proxy_temp)

    # Start one checking thread per proxy, then wait for all of them.
    threads = []
    for proxy in proxys:
        thread = threading.Thread(target=testIP, args=[proxy])
        threads.append(thread)
        thread.start()
    for thread in threads:
        thread.join()

    # Write the collected entries; the context manager closes the file even
    # if the write fails.
    with open(output_path, 'w') as f:
        f.write('proxy = [\r\n' + ipText + ']')


# Run only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
现在,Python 很受欢迎啊