爬取西刺代理
Posted On 2017-08-16
西刺代理:http://www.xicidaili.com/
|
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 |
#!/env/bin/python
#coding:utf-8
"""Collect HTTP/HTTPS proxies listed on the xicidaili.com front page."""
import re
import time
import json

_XICI_URL = "http://www.xicidaili.com/"

# Browser-like request headers. The browser-captured "If-None-Match" validator
# is intentionally not sent: it turns the request into a conditional GET, and a
# 304 reply carries no body to parse (requests keeps no cache to fall back on).
_HEADERS = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.8",
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",
    "Host": "www.xicidaili.com",
    "Referer": "https://www.baidu.com/link?url=van14UVKfsx0R7IGpr7UpHUhcTpRrG_y04zqbIbqw-6pBhVmpnMt1UJMMHcQ89E7&wd=&eqid=b1669ff300050f08000000065993eee2",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
}

# Compiled once at import time; the patterns are the ones the scraper has
# always used, so the set of rows/cells recognised is unchanged.
_TABLE_RE = re.compile(r'<table id="ip_list">[\w\W]*?</table>')
_ROW_RE = re.compile(r"<tr.*?>[\w\W]*?</tr>")
_CELL_RE = re.compile(r"<td.*?>([\w\W]*?)</td>")

_WANTED_SCHEMES = ("http", "https")
_CELLS_PER_ROW = 8


def _parse_proxies(html):
    """Extract ``{scheme: "host:port"}`` entries from the ``ip_list`` table in *html*."""
    table_match = _TABLE_RE.search(html)
    if table_match is None:
        # No listing table (block page, error page, empty body): nothing to
        # report, rather than an IndexError from indexing an empty match list.
        return []

    proxies = []
    for row in _ROW_RE.findall(table_match.group(0)):
        cells = _CELL_RE.findall(row)
        if len(cells) != _CELLS_PER_ROW:
            # Header rows and section separators do not have eight <td> cells.
            continue
        # Cell 5 holds the protocol label; cells 1 and 2 are joined as
        # "host:port". Surrounding whitespace in the markup is not significant.
        scheme = cells[5].strip().lower()
        if scheme in _WANTED_SCHEMES:
            proxies.append({scheme: "%s:%s" % (cells[1].strip(), cells[2].strip())})
    return proxies


def get_xici(html=None, timeout=10):
    """Return the HTTP/HTTPS proxies listed on the xicidaili.com front page.

    Args:
        html: Optional page source to parse instead of downloading it. When
            ``None`` (the default) the page is fetched from the site.
        timeout: Seconds to wait for the server before giving up; only used
            when the page is downloaded.

    Returns:
        A list of single-entry dicts, one per proxy, each mapping the
        lower-cased scheme (``"http"`` or ``"https"``) to ``"host:port"``.
        The list is empty when the page contains no ``ip_list`` table.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an HTTP error status.
    """
    if html is None:
        # Imported here so that parsing already-downloaded HTML needs neither
        # the third-party package nor network access.
        import requests

        response = requests.get(_XICI_URL, headers=_HEADERS, timeout=timeout)
        response.raise_for_status()
        html = response.text
    return _parse_proxies(html)