How to Fetch Data from a URL in Python
Created
Modified
Fetching URLs
The simplest way to use urllib.request is as follows:
#!/usr/bin/env python3
# Import the urllib.request module
import urllib.request

# Pass the URL string straight to urlopen. Using the response as a
# context manager closes the connection when the block exits.
with urllib.request.urlopen('https://google.com') as response:
    html = response.read()

# The same request, made through an explicit Request object
req = urllib.request.Request('https://google.com')
with urllib.request.urlopen(req) as response:
    content = response.read()
POST Data
The data needs to be encoded in a standard way, and then passed to the Request object as the data argument.
#!/usr/bin/env python3
# Import modules
import urllib.parse
import urllib.request

# POST
url = 'https://google.com'
param = {'q': 'Python'}
data = urllib.parse.urlencode(param)
# data should be bytes
data = data.encode('ascii')
# Supplying a data argument makes urllib send the request as a POST
req = urllib.request.Request(url, data)
with urllib.request.urlopen(req) as response:
    content = response.read()

# HTTP GET request
url = 'https://google.com'
param = {'q': 'Python'}
data = urllib.parse.urlencode(param)
print(data)
# For a GET request the encoded parameters travel in the URL itself,
# so the request must be built from full_url rather than the bare url.
full_url = url + '?' + data
req = urllib.request.Request(full_url)
with urllib.request.urlopen(req) as response:
    content = response.read()
q=Python
Headers
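By default urllib identifies itself as Python-urllib/x.y, where x and y are the major and minor version numbers of the Python release. To send a different User-Agent (or any other header), pass a dictionary of headers to the Request object.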
#!/usr/bin/env python3
# Import modules
import urllib.parse
import urllib.request

# Headers
url = 'https://installmd.com'
param = {'q': 'Python'}
# Replace the default User-Agent with a browser-like value
headers = {'User-Agent': 'Mozilla/5.0'}
data = urllib.parse.urlencode(param)
# data should be bytes
data = data.encode('ascii')
# Keyword arguments make it explicit which value is the body and which
# is the header dictionary.
req = urllib.request.Request(url, data=data, headers=headers)
with urllib.request.urlopen(req) as response:
    content = response.read()
Handling Exceptions
urlopen raises URLError when it cannot handle a response.
#!/usr/bin/env python3
# Import modules
import urllib.error
import urllib.request

# no host given: the URL below deliberately has a single slash after
# the scheme, so no host can be parsed from it.
req = urllib.request.Request('https:/google.com')
try:
    with urllib.request.urlopen(req) as response:
        content = response.read()
except urllib.error.URLError as e:
    # reason describes why the request could not be completed
    print(e.reason)
no host given
Error Codes
# Lookup table from HTTP status code to a pair of human-readable
# strings: {code: (shortmessage, longmessage)}.
responses = {
    # 1xx
    100: (
        'Continue',
        'Request received, please continue',
    ),
    101: (
        'Switching Protocols',
        'Switching to new protocol; obey Upgrade header',
    ),

    # 2xx
    200: (
        'OK',
        'Request fulfilled, document follows',
    ),
    201: (
        'Created',
        'Document created, URL follows',
    ),
    202: (
        'Accepted',
        'Request accepted, processing continues off-line',
    ),
    203: (
        'Non-Authoritative Information',
        'Request fulfilled from cache',
    ),
    204: (
        'No Content',
        'Request fulfilled, nothing follows',
    ),
    205: (
        'Reset Content',
        'Clear input form for further input.',
    ),
    206: (
        'Partial Content',
        'Partial content follows.',
    ),

    # 3xx
    300: (
        'Multiple Choices',
        'Object has several resources -- see URI list',
    ),
    301: (
        'Moved Permanently',
        'Object moved permanently -- see URI list',
    ),
    302: (
        'Found',
        'Object moved temporarily -- see URI list',
    ),
    303: (
        'See Other',
        'Object moved -- see Method and URL list',
    ),
    304: (
        'Not Modified',
        'Document has not changed since given time',
    ),
    305: (
        'Use Proxy',
        'You must use proxy specified in Location to access this resource.',
    ),
    307: (
        'Temporary Redirect',
        'Object moved temporarily -- see URI list',
    ),

    # 4xx
    400: (
        'Bad Request',
        'Bad request syntax or unsupported method',
    ),
    401: (
        'Unauthorized',
        'No permission -- see authorization schemes',
    ),
    402: (
        'Payment Required',
        'No payment -- see charging schemes',
    ),
    403: (
        'Forbidden',
        'Request forbidden -- authorization will not help',
    ),
    404: (
        'Not Found',
        'Nothing matches the given URI',
    ),
    405: (
        'Method Not Allowed',
        'Specified method is invalid for this server.',
    ),
    406: (
        'Not Acceptable',
        'URI not available in preferred format.',
    ),
    407: (
        'Proxy Authentication Required',
        'You must authenticate with this proxy before proceeding.',
    ),
    408: (
        'Request Timeout',
        'Request timed out; try again later.',
    ),
    409: (
        'Conflict',
        'Request conflict.',
    ),
    410: (
        'Gone',
        'URI no longer exists and has been permanently removed.',
    ),
    411: (
        'Length Required',
        'Client must specify Content-Length.',
    ),
    412: (
        'Precondition Failed',
        'Precondition in headers is false.',
    ),
    413: (
        'Request Entity Too Large',
        'Entity is too large.',
    ),
    414: (
        'Request-URI Too Long',
        'URI is too long.',
    ),
    415: (
        'Unsupported Media Type',
        'Entity body in unsupported format.',
    ),
    416: (
        'Requested Range Not Satisfiable',
        'Cannot satisfy request range.',
    ),
    417: (
        'Expectation Failed',
        'Expect condition could not be satisfied.',
    ),

    # 5xx
    500: (
        'Internal Server Error',
        'Server got itself in trouble',
    ),
    501: (
        'Not Implemented',
        'Server does not support this operation',
    ),
    502: (
        'Bad Gateway',
        'Invalid responses from another server/proxy.',
    ),
    503: (
        'Service Unavailable',
        'The server cannot process the request due to a high load',
    ),
    504: (
        'Gateway Timeout',
        'The gateway server did not receive a timely response',
    ),
    505: (
        'HTTP Version Not Supported',
        'Cannot fulfill request.',
    ),
}