Source code for weblib.html

# -*- coding: utf-8 -*-
# Copyright: 2011, Grigoriy Petukhov
# Author: Grigoriy Petukhov (http://lorien.name)
# License: BSD
import re
try:
    from htmlentitydefs import name2codepoint
except ImportError:
    from html.entities import name2codepoint
import logging

from weblib.text import normalize_space as normalize_space_func
import six
#from weblib.py3k_support import *

# Any single markup tag: "<", one or more non-">" characters, ">".
RE_TAG = re.compile(r'<[^>]+>')

# A <meta> tag whose http-equiv attribute starts with "refresh" (any case).
# The match runs up to, but does not include, the closing ">".  The trailing
# part is "[^>]*" (not "+") so that a tag ending right after an unquoted
# value, e.g. <meta content="0;url=/x" http-equiv=refresh>, is still found.
RE_REFRESH_TAG = re.compile(r'<meta[^>]+http-equiv\s*=\s*["\']*Refresh[^>]*', re.I)

# Extracts the target from the content attribute of a refresh tag, e.g.
# <meta http-equiv='REFRESH' content='0;url= http://www.bk55.ru/mc2/news/article/855'>
# Group 1 is the URL; it is empty when the attribute holds only a delay.
RE_REFRESH_URL = re.compile(r'''
    content \s* = \s*
    ["\']* \s* \d+ \s*
    ;?
    (?: \s* url \s* = \s*)?
    ["\']* ([^\'"> ]*)
''', re.I | re.X)

# Named entity such as &amp;, &Auml; or &frac12;.  Entity names are case
# sensitive and may contain digits, so both must be accepted here.
RE_ENTITY = re.compile(r'(&[a-zA-Z][a-zA-Z0-9]*;)')
# Decimal character reference such as &#82;
RE_NUM_ENTITY = re.compile(r'(&#[0-9]+;)')
# Hexadecimal character reference such as &#x52; (the "x" may be upper case).
RE_HEX_ENTITY = re.compile(r'(&#x[a-f0-9]+;)', re.I)
# href value of a <base> tag; group 1 is the URL without surrounding quotes.
RE_BASE_URL = re.compile(r'<base[^>]+href\s*=["\']*([^\'"> ]+)', re.I)
# <br>, <br/> and <br /> in any letter case.
RE_BR = re.compile(r'<br\s*/?>', re.I)


try:
    _unichr = unichr  # Python 2: chr() is limited to 0-255 there
except NameError:
    _unichr = chr  # Python 3: chr() already returns text

# One pattern for all three entity forms so the input is scanned exactly once.
# Groups: 1 = hexadecimal digits, 2 = decimal digits, 3 = entity name.
_RE_ANY_ENTITY = re.compile(
    r'&(?:#[xX]([0-9a-fA-F]+)|#([0-9]+)|([a-zA-Z][a-zA-Z0-9]*));')


def decode_entities(html):
    """
    Convert all HTML entities into their unicode
    representations.

    This function processes the following entities:
     * &name;  (names known to ``name2codepoint``, case sensitive)
     * &#NNN;  (decimal character reference)
     * &#xHH;  (hexadecimal character reference)

    The text is decoded in a single pass, so the output of one replacement
    is never decoded again: ``&#38;lt;`` becomes ``&lt;``, not ``<``.

    Entities that cannot be converted (unknown name, code point outside the
    range accepted by the interpreter) are left in the text unchanged.

    :param html: text that may contain HTML entities
    :returns: the text with every recognised entity replaced

    Example::

        >>> print(decode_entities('&rarr;ABC&nbsp;&#82;&copy;'))
        →ABC R©
    """

    def replace_entity(match):
        hex_digits, dec_digits, name = match.groups()
        try:
            if hex_digits is not None:
                return _unichr(int(hex_digits, 16))
            if dec_digits is not None:
                return _unichr(int(dec_digits))
            return _unichr(name2codepoint[name])
        except (KeyError, ValueError, OverflowError):
            # KeyError: unknown entity name.
            # ValueError / OverflowError: the number is not a code point the
            # interpreter can represent.  Keep the original text in all cases.
            return match.group(0)

    return _RE_ANY_ENTITY.sub(replace_entity, html)


def find_refresh_url(html):
    """
    Find value of redirect url from http-equiv refresh meta tag.

    The tag is located in the markup as given; entities are decoded only
    inside the matched tag.  Decoding the whole document first would turn
    escaped sample markup (``&lt;meta http-equiv=...&gt;``) into something
    that looks like a real tag, and would also decode the entire page just
    to inspect one tag.

    :param html: text of the HTML document
    :returns: the URL found in the ``content`` attribute of the first
        refresh meta tag (an empty string when the attribute holds only a
        delay), or ``None`` when no refresh tag or no ``content`` attribute
        is found
    """

    tag_match = RE_REFRESH_TAG.search(html)
    if tag_match is None:
        return None

    # Quotes around the url may themselves be written as entities
    # (&#39;, &#34;, &quot;), so decode the tag before extracting the value.
    tag = decode_entities(tag_match.group(0))

    url_match = RE_REFRESH_URL.search(tag)
    if url_match is None:
        return None
    return url_match.group(1)


def find_base_url(html):
    """
    Find url of <base> tag.

    The tag is located in the markup as given; entities are decoded only
    inside the matched fragment.  Decoding the whole document first would
    let escaped sample markup (``&lt;base href=...&gt;``) be mistaken for a
    real tag, and would decode the entire page just to inspect one tag.

    :param html: text of the HTML document
    :returns: value of the ``href`` attribute of the first ``<base>`` tag,
        or ``None`` when there is no such tag
    """

    raw_match = RE_BASE_URL.search(html)
    if raw_match is None:
        return None

    # Decode just the matched "<base ... href=value" fragment and extract the
    # value again, so that entity-encoded quotes and ampersands inside the
    # attribute are handled.
    fragment = decode_entities(raw_match.group(0))
    decoded_match = RE_BASE_URL.search(fragment)
    if decoded_match is None:
        return None
    return decoded_match.group(1)


def strip_tags(html, normalize_space=True, convert_br=False):
    """
    Remove markup tags from the text, putting a space in place of each tag.

    :param html: text containing markup
    :param normalize_space: when true, pass the result through
        ``weblib.text.normalize_space`` before returning it
    :param convert_br: when true, replace ``<br>`` tags with a newline
        before the remaining tags are removed
    :returns: the text without tags
    """
    # NOTE(review): if normalize_space collapses newlines, the newlines
    # produced by convert_br are lost when both flags are set -- confirm
    # against weblib.text.normalize_space.
    source = RE_BR.sub('\n', html) if convert_br else html
    stripped = RE_TAG.sub(' ', source)
    if not normalize_space:
        return stripped
    return normalize_space_func(stripped)


def escape(html):
    """
    Encode the characters that are special in HTML.

    Ampersands, angle brackets, double quotes and single quotes are
    replaced with ``&amp;``, ``&lt;``, ``&gt;``, ``&quot;`` and ``&#39;``.

    :param html: text to encode
    :returns: the encoded text
    """

    # The ampersand has to be handled first: every other replacement
    # introduces new ampersands that must not be encoded a second time.
    substitutions = (
        ('&', '&amp;'),
        ('<', '&lt;'),
        ('>', '&gt;'),
        ('"', '&quot;'),
        ("'", '&#39;'),
    )
    encoded = html
    for char, entity in substitutions:
        encoded = encoded.replace(char, entity)
    return encoded