How to fetch data from a URL in Python

Created
Modified

Fetching URLs

The simplest way to use urllib.request is as follows:

#!/usr/bin/env python3

# Import the urllib.request module
import urllib.request

# Pass the URL string straight to urlopen(); the with-statement closes
# the response when the block exits.
with urllib.request.urlopen('https://google.com') as response:
    html = response.read()

# The same fetch, this time through an explicit Request object
req = urllib.request.Request('https://google.com')
with urllib.request.urlopen(req) as response:
    content = response.read()

POST Data

To send data with a POST request, URL-encode it with urllib.parse.urlencode, convert the result to bytes, and pass it to the Request object as the data argument.

#!/usr/bin/env python3

# Import modules
import urllib.parse
import urllib.request

# HTTP POST request
url = 'https://google.com'
param = {'q': 'Python'}
data = urllib.parse.urlencode(param)
# data should be bytes
data = data.encode('ascii')
# Supplying data makes urllib send a POST instead of a GET
req = urllib.request.Request(url, data)
with urllib.request.urlopen(req) as response:
    content = response.read()

# HTTP GET request
url = 'https://google.com'
param = {'q': 'Python'}
data = urllib.parse.urlencode(param)
print(data)
# For GET, the encoded parameters travel in the query string
full_url = url + '?' + data
# Request the URL that carries the query string, not the bare url
req = urllib.request.Request(full_url)
with urllib.request.urlopen(req) as response:
    content = response.read()
q=Python

Headers

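By default urllib identifies itself as Python-urllib/x.y, where x.y is the Python version. To send a different User-Agent (or any other header), pass a dictionary of headers to the Request object.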

#!/usr/bin/env python3

# Import modules
import urllib.parse
import urllib.request

# Send a custom User-Agent header along with the form data
url = 'https://installmd.com'
param = {'q': 'Python'}
headers = {'User-Agent': 'Mozilla/5.0'}

# urlencode() yields a str; the request body must be bytes
data = urllib.parse.urlencode(param).encode('ascii')
req = urllib.request.Request(url, data=data, headers=headers)
with urllib.request.urlopen(req) as response:
    content = response.read()

Handling Exceptions

urlopen raises URLError when it cannot handle a response, for example when the URL contains no host, as shown below.

#!/usr/bin/env python3

# Import modules
import urllib.error
import urllib.request

# Malformed URL (single slash after the scheme): no host given
req = urllib.request.Request('https:/google.com')

try:
    with urllib.request.urlopen(req) as response:
        content = response.read()
except urllib.error.URLError as err:
    # reason describes why the request could not be handled
    print(err.reason)
no host given

Error Codes

# Lookup table of HTTP status codes. Each key is the numeric code and
# each value is a (shortmessage, longmessage) tuple.
responses = {
    # 1xx
    100: ('Continue', 'Request received, please continue'),
    101: (
        'Switching Protocols',
        'Switching to new protocol; obey Upgrade header',
    ),

    # 2xx
    200: ('OK', 'Request fulfilled, document follows'),
    201: ('Created', 'Document created, URL follows'),
    202: (
        'Accepted',
        'Request accepted, processing continues off-line',
    ),
    203: ('Non-Authoritative Information', 'Request fulfilled from cache'),
    204: ('No Content', 'Request fulfilled, nothing follows'),
    205: ('Reset Content', 'Clear input form for further input.'),
    206: ('Partial Content', 'Partial content follows.'),

    # 3xx
    300: (
        'Multiple Choices',
        'Object has several resources -- see URI list',
    ),
    301: ('Moved Permanently', 'Object moved permanently -- see URI list'),
    302: ('Found', 'Object moved temporarily -- see URI list'),
    303: ('See Other', 'Object moved -- see Method and URL list'),
    304: (
        'Not Modified',
        'Document has not changed since given time',
    ),
    305: (
        'Use Proxy',
        'You must use proxy specified in Location to access this '
        'resource.',
    ),
    307: (
        'Temporary Redirect',
        'Object moved temporarily -- see URI list',
    ),

    # 4xx
    400: (
        'Bad Request',
        'Bad request syntax or unsupported method',
    ),
    401: (
        'Unauthorized',
        'No permission -- see authorization schemes',
    ),
    402: (
        'Payment Required',
        'No payment -- see charging schemes',
    ),
    403: (
        'Forbidden',
        'Request forbidden -- authorization will not help',
    ),
    404: ('Not Found', 'Nothing matches the given URI'),
    405: (
        'Method Not Allowed',
        'Specified method is invalid for this server.',
    ),
    406: ('Not Acceptable', 'URI not available in preferred format.'),
    407: (
        'Proxy Authentication Required',
        'You must authenticate with '
        'this proxy before proceeding.',
    ),
    408: ('Request Timeout', 'Request timed out; try again later.'),
    409: ('Conflict', 'Request conflict.'),
    410: (
        'Gone',
        'URI no longer exists and has been permanently removed.',
    ),
    411: ('Length Required', 'Client must specify Content-Length.'),
    412: ('Precondition Failed', 'Precondition in headers is false.'),
    413: ('Request Entity Too Large', 'Entity is too large.'),
    414: ('Request-URI Too Long', 'URI is too long.'),
    415: ('Unsupported Media Type', 'Entity body in unsupported format.'),
    416: (
        'Requested Range Not Satisfiable',
        'Cannot satisfy request range.',
    ),
    417: (
        'Expectation Failed',
        'Expect condition could not be satisfied.',
    ),

    # 5xx
    500: ('Internal Server Error', 'Server got itself in trouble'),
    501: (
        'Not Implemented',
        'Server does not support this operation',
    ),
    502: ('Bad Gateway', 'Invalid responses from another server/proxy.'),
    503: (
        'Service Unavailable',
        'The server cannot process the request due to a high load',
    ),
    504: (
        'Gateway Timeout',
        'The gateway server did not receive a timely response',
    ),
    505: ('HTTP Version Not Supported', 'Cannot fulfill request.'),
}

Related Tags

#fetch# #url# #urllib#