Source code for weblib.http

try:
    import urllib.parse as urllib
except ImportError:
    import urllib
from six.moves.urllib.parse import urlsplit, urlunsplit, quote, unquote
import re
import logging
import six

from weblib.encoding import make_str, make_unicode, decode_pairs

from weblib.py3k_support import *

# I do not know, what the hell is going on, but sometimes
# when IDN url should be requested grab fails with error
# LookupError: unknown encoding: punycode
# That happens in grab/base.py near by 347 line on the line::
# kwargs['url'] = normalize_url(kwargs['url'])
# If you try to catch the error with except and import pdb; pdb.set_trace()
# then you'll get "no pdb module" error. WTF??
# But if you import pdb at the top of the module then you can use it
# So.... I import here this module and I hope that will helps
# My idea is that some mystical shit does some thing that breaks python
# environment,, breaks sys.path So, when special case occurs and some new module
# is need to be imported then that can't be done due to the unknown magical influence
import encodings.punycode

# From RFC 3986
# reserved    = gen-delims / sub-delims
# gen-delims  = ":" / "/" / "?" / "#" / "[" / "]" / "@"
# sub-delims  = "!" / "$" / "&" / "'" / "(" / ")"
#               / "*" / "+" / "," / ";" / "="
# unreserved  = ALPHA / DIGIT / "-" / "." / "_" / "~"
GEN_DELIMS = r':/?#[]@'
SUB_DELIMS = r'!$&\'()*+,;='
RESERVED_CHARS = GEN_DELIMS + SUB_DELIMS + '%'
UNRESERVED_CHARS = r'-._~a-zA-Z0-9'
RE_NOT_SAFE_CHAR = re.compile(r'[^' +
                              UNRESERVED_CHARS + 
                              re.escape(RESERVED_CHARS) + ']')
RE_NON_ALPHA_DIGIT_NETLOC = re.compile(r'[^-.:@a-zA-Z0-9]')
logger = logging.getLogger('weblib.http')


def urlencode(*args, **kwargs):
    logger.debug('Method weblib.http.urlencode is deprecated. '
                 'Please use weblib.http.smart_urlencode')
    return smart_urlencode(*args, **kwargs)


[docs]def smart_urlencode(items, charset='utf-8'):
    """
    Convert sequence of items into bytestring which could be submitted
    in POST or GET request.

    It differs from ``urllib.urlencode`` in that it can process unicode
    and some special values.

    ``items`` could dict or tuple or list.
    """

    if isinstance(items, dict):
        items = items.items()
    return urllib.urlencode(normalize_http_values(items, charset=charset))


[docs]def encode_cookies(items, join=True, charset='utf-8'):
    """
    Serialize dict or sequence of two-element items into string suitable
    for sending in Cookie http header.
    """

    def encode(val):
        """
        URL-encode special characters in the text.

        In cookie value only ",", " ", "\t" and ";" should be encoded
        """

        return val.replace(b' ', b'%20').replace(b'\t', b'%09')\
                  .replace(b';', b'%3B').replace(b',', b'%2C')

    if isinstance(items, dict):
        items = items.items()
    items = normalize_http_values(items, charset=charset)

    # py3 hack
    #if PY3K:
    #    items = decode_pairs(items, charset)

    tokens = []
    for key, value in items:
        tokens.append(b'='.join((encode(key), encode(value))))
    if join:
        return b'; '.join(tokens)
    else:
        return tokens


[docs]def normalize_http_values(items, charset='utf-8', ignore_classes=None):
    """
    Accept sequence of (key, value) paris or dict and convert each
    value into bytestring.

    Unicode is converted into bytestring using charset of previous response
    (or utf-8, if no requests were performed)

    None is converted into empty string. 

    If `ignore_classes` is not None and the value is instance of
    any classes from the `ignore_classes` then the value is not
    processed and returned as-is.
    """

    if isinstance(items, dict):
        items = items.items()

    def process(item):
        key, value = item

        if ignore_classes and isinstance(value, ignore_classes):
            pass
        elif isinstance(value, unicode):
            value = normalize_unicode(value, charset=charset)
        elif value is None:
            value = ''
        else:
            value = str(value)

        if isinstance(key, unicode):
            key = normalize_unicode(key, charset=charset)

        return key, value

    items =  list(map(process, items))
    #items = sorted(items, key=lambda x: x[0])
    return items


[docs]def normalize_unicode(value, charset='utf-8'):
    """
    Convert unicode into byte-string using detected charset (default or from
    previous response)

    By default, charset from previous response is used to encode unicode into
    byte-string but you can enforce charset with ``charset`` option
    """

    if not isinstance(value, unicode):
        return value
    else:
        return value.encode(charset, 'ignore')


def normalize_url(url):
    # https://tools.ietf.org/html/rfc3986
    url = make_unicode(url)
    if RE_NOT_SAFE_CHAR.search(url):
        parts = list(urlsplit(url))
        # Scheme
        pass
        # Network location (user:pass@hostname)
        if RE_NON_ALPHA_DIGIT_NETLOC.search(parts[1]):
            parts[1] = parts[1].encode('idna')
        # Path
        parts[2] = quote(make_str(parts[2]), safe=RESERVED_CHARS)
        # Query
        parts[3] = quote(make_str(parts[3]), safe=RESERVED_CHARS)
        # Fragment
        parts[4] = quote(make_str(parts[4]), safe=RESERVED_CHARS)
        return urlunsplit(map(make_unicode, parts))
    return url


def normalize_post_data(data, charset):
    if isinstance(data, (six.binary_type, six.text_type)):
        # bytes-string should be posted as-is
        # unicode should be converted into byte-string
        if isinstance(data, unicode):
            return normalize_unicode(data, charset)
        else:
            return data
    else:
        # dict, tuple, list should be serialized into byte-string
        return smart_urlencode(data, charset)