爬取西刺代理

Posted On 2017-08-16

西刺代理：http://www.xicidaili.com/

#!/usr/bin/env python
#coding:utf-8

import re
import time
import json
import requests
def get_xici(html=None, timeout=10):
    """Return the HTTP/HTTPS proxies listed on the xicidaili.com home page.

    Args:
        html: Optional page source to parse. When ``None`` (the default) the
            page is downloaded from http://www.xicidaili.com/ first.
        timeout: Seconds to wait for the server when downloading; ignored
            when ``html`` is supplied.

    Returns:
        A list of single-entry dicts mapping the lower-cased protocol
        (``"http"`` or ``"https"``) to ``"ip:port"``, in page order. The list
        is empty when the page contains no ``ip_list`` table.
    """
    if html is None:
        # Browser-like headers. The conditional "If-None-Match" validator is
        # deliberately not sent: nothing is cached here, so a
        # "304 Not Modified" reply would leave us with an empty body.
        headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.8",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            "Host": "www.xicidaili.com",
            "Referer": "https://www.baidu.com/link?url=van14UVKfsx0R7IGpr7UpHUhcTpRrG_y04zqbIbqw-6pBhVmpnMt1UJMMHcQ89E7&wd=&eqid=b1669ff300050f08000000065993eee2",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
        }
        url = "http://www.xicidaili.com/"
        # Without a timeout requests may block forever on a stalled server.
        html = requests.get(url, headers=headers, timeout=timeout).text

    table = re.search(r'<table id="ip_list">[\w\W]*?</table>', html)
    if table is None:
        # Blocked, redirected or redesigned page: nothing to extract.
        return []

    proxy_list = []
    for tr in re.findall(r"<tr.*?>[\w\W]*?</tr>", table.group(0)):
        tds = [td.strip() for td in re.findall(r"<td.*?>([\w\W]*?)</td>", tr)]
        # Only rows with exactly 8 cells are treated as proxy entries;
        # cell 1 is used as the IP, cell 2 as the port, cell 5 as the protocol.
        if len(tds) != 8:
            continue
        scheme = tds[5].lower()
        if scheme in ("http", "https"):
            proxy_list.append({scheme: "%s:%s" % (tds[1], tds[2])})
    return proxy_list

#!/usr/bin/env python

#coding:utf-8

import re

import time

import json

import requests

def get_xici(html=None, timeout=10):
    """Return the HTTP/HTTPS proxies listed on the xicidaili.com home page.

    Args:
        html: Optional page source to parse. When ``None`` (the default) the
            page is downloaded from http://www.xicidaili.com/ first.
        timeout: Seconds to wait for the server when downloading; ignored
            when ``html`` is supplied.

    Returns:
        A list of single-entry dicts mapping the lower-cased protocol
        (``"http"`` or ``"https"``) to ``"ip:port"``, in page order. The list
        is empty when the page contains no ``ip_list`` table.
    """
    if html is None:
        # Browser-like headers. The conditional "If-None-Match" validator is
        # deliberately not sent: nothing is cached here, so a
        # "304 Not Modified" reply would leave us with an empty body.
        headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.8",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            "Host": "www.xicidaili.com",
            "Referer": "https://www.baidu.com/link?url=van14UVKfsx0R7IGpr7UpHUhcTpRrG_y04zqbIbqw-6pBhVmpnMt1UJMMHcQ89E7&wd=&eqid=b1669ff300050f08000000065993eee2",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
        }
        url = "http://www.xicidaili.com/"
        # Without a timeout requests may block forever on a stalled server.
        html = requests.get(url, headers=headers, timeout=timeout).text

    table = re.search(r'<table id="ip_list">[\w\W]*?</table>', html)
    if table is None:
        # Blocked, redirected or redesigned page: nothing to extract.
        return []

    proxy_list = []
    for tr in re.findall(r"<tr.*?>[\w\W]*?</tr>", table.group(0)):
        tds = [td.strip() for td in re.findall(r"<td.*?>([\w\W]*?)</td>", tr)]
        # Only rows with exactly 8 cells are treated as proxy entries;
        # cell 1 is used as the IP, cell 2 as the port, cell 5 as the protocol.
        if len(tds) != 8:
            continue
        scheme = tds[5].lower()
        if scheme in ("http", "https"):
            proxy_list.append({scheme: "%s:%s" % (tds[1], tds[2])})
    return proxy_list

Add a Comment

要发表评论，您必须先登录。

一	二	三	四	五	六	日
« 9月
1	2	3	4	5	6	7
8	9	10	11	12	13	14
15	16	17	18	19	20	21
22	23	24	25	26	27	28
29	30