urllib提供了一系列用于操作URL的功能,而urllib的request模块可以非常方便地抓取URL内容,也就是发送一个GET请求到指定的页面,然后返回HTTP的响应。
基本的网络请求示例
import urllib.request

# Request the Baidu home page and print the first 300 raw bytes.
# The ``with`` block closes the HTTP response as soon as it has been read.
with urllib.request.urlopen(
    'https://www.baidu.com', data=None, timeout=10
) as resu:
    print(resu.read(300))

# Decode the response with an explicitly chosen codec.
# A partial read can stop in the middle of a multi-byte character, so
# errors='replace' keeps the truncated tail from raising UnicodeDecodeError.
with urllib.request.urlopen('https://www.baidu.com') as resu:
    print(resu.read(300).decode('GBK', errors='replace'))

# Same request, this time decoded as UTF-8.
with urllib.request.urlopen('https://www.baidu.com') as f:
    print(f.read(100).decode('utf-8', errors='replace'))
发送数据请求,CGI程序处理
import urllib.request

# Supplying ``data`` makes urllib send the bytes as the request body;
# the payload text says the CGI script receives it on stdin.
req = urllib.request.Request(
    url='http://localhost/cgi-bin/test.cgi',
    data=b'This data is passed to stdin of the CGI',
)
# Close the response deterministically instead of leaking the connection.
with urllib.request.urlopen(req) as f:
    print(f.read().decode('utf-8'))
PUT请求
import urllib.request

DATA = b'some data'

# ``method='PUT'`` overrides the HTTP verb urllib would otherwise pick.
req = urllib.request.Request(url='http://localhost:8080', data=DATA, method='PUT')
# The response is closed when the block exits, after status/reason are shown.
with urllib.request.urlopen(req) as f:
    print(f.status)
    print(f.reason)
基本的HTTP验证,登录请求
import urllib.request

# Create an OpenerDirector with support for Basic HTTP Authentication...
auth_handler = urllib.request.HTTPBasicAuthHandler()
# NOTE(review): the credentials are registered for 'http://xxx/test.py' while
# the request below targets www.example.com -- both look like placeholders;
# point them at the same site when adapting this snippet.
auth_handler.add_password(
    realm='Application',
    uri='http://xxx/test.py',
    user='test',
    passwd='test',
)
opener = urllib.request.build_opener(auth_handler)
# ...and install it globally so it can be used with urlopen.
urllib.request.install_opener(opener)
# The response body is not used here; the ``with`` block only makes sure the
# connection is released instead of being left open.
with urllib.request.urlopen('http://www.example.com/login.html'):
    pass
通过代理发送请求(带代理身份验证)
# Imported here so the snippet runs on its own.
import urllib.request

# Route plain-HTTP traffic through the given proxy.
proxy_handler = urllib.request.ProxyHandler({'http': 'http://www.example.com/'})
# Credentials to present when the proxy asks for Basic authentication:
# add_password(realm, uri, user, passwd).
proxy_auth_handler = urllib.request.ProxyBasicAuthHandler()
proxy_auth_handler.add_password('aaa', 'bbb', 'username', 'password')

opener = urllib.request.build_opener(proxy_handler, proxy_auth_handler)
# This time, rather than install the OpenerDirector, we use it directly.
# The response is not used; the ``with`` block just closes it.
with opener.open('http://www.example.com/login.html'):
    pass
添加 http headers
import urllib.request

# Attach the extra header when the Request is built rather than afterwards.
extra_headers = {'Referer': 'http://www.python.org/'}
req = urllib.request.Request('http://www.example.com/', headers=extra_headers)
# ``r`` is the open HTTP response, left available for the caller to read.
r = urllib.request.urlopen(req)
添加 user-agent
import urllib.request

opener = urllib.request.build_opener()
# ``addheaders`` is sent with every request made through this opener;
# assigning the list replaces the opener's default header set.
opener.addheaders = [('User-agent', 'Mozilla/5.0')]
# The response is not used; the ``with`` block closes it instead of leaking it.
with opener.open('http://www.example.com/'):
    pass
带参数的GET 请求
import urllib.parse
import urllib.request

# urlencode() turns the mapping into a percent-encoded query string.
params = urllib.parse.urlencode({'a': 1, 'b': 2, 'c': 0})
url = "http://xxx/query?%s" % params
# No ``data`` argument is given, so the parameters travel in the URL itself.
with urllib.request.urlopen(url) as f:
    print(f.read().decode('utf-8'))
带参数的POST请求
import urllib.parse
import urllib.request

# Form-encode the fields, then convert to bytes: urlopen() requires the
# request body to be bytes, not str.
data = urllib.parse.urlencode({'a': 1, 'b': 2, 'c': 0})
data = data.encode('utf-8')

request = urllib.request.Request("http://xxx")
# adding charset parameter to the Content-Type header.
request.add_header("Content-Type", "application/x-www-form-urlencoded;charset=utf-8")
# Passing ``data`` sends the fields in the request body; the response is
# closed when the block exits.
with urllib.request.urlopen(request, data) as f:
    print(f.read().decode('utf-8'))
指定代理方式请求
import urllib.request

# FancyURLopener has been deprecated since Python 3.3 and is removed in 3.14;
# ProxyHandler + build_opener is the supported way to pick a proxy explicitly.
proxies = {'http': 'http://proxy.example.com:8080/'}
opener = urllib.request.build_opener(urllib.request.ProxyHandler(proxies))
# Print the decoded page (a bare expression would silently discard it) and
# close the response when done.
with opener.open("http://www.python.org") as f:
    print(f.read().decode('utf-8'))
不使用代理的请求
import urllib.request

# FancyURLopener has been deprecated since Python 3.3 and is removed in 3.14.
# An empty mapping passed to ProxyHandler disables proxies entirely,
# including any that would be auto-detected from the environment.
opener = urllib.request.build_opener(urllib.request.ProxyHandler({}))
# Print the decoded page (a bare expression would silently discard it) and
# close the response when done.
with opener.open("http://www.python.org/") as f:
    print(f.read().decode('utf-8'))