# -*- coding: utf-8 -*-
# Copyright: 2011, Grigoriy Petukhov
# Author: Grigoriy Petukhov (http://lorien.name)
# License: BSD
import re
try:
from htmlentitydefs import name2codepoint
except ImportError:
from html.entities import name2codepoint
import logging
from weblib.text import normalize_space as normalize_space_func
import six
#from weblib.py3k_support import *
# Any single tag: "<" followed by everything up to the next ">".
RE_TAG = re.compile(r'<[^>]+>')

# A <meta> tag whose http-equiv attribute is "Refresh" (case-insensitive).
RE_REFRESH_TAG = re.compile(r'<meta[^>]+http-equiv\s*=\s*["\']*Refresh[^>]+', re.I)

# Extracts the target URL from the content attribute of a refresh meta tag.
# Example of a tag this is meant to handle:
# <meta http-equiv='REFRESH' content='0;url= http://www.bk55.ru/mc2/news/article/855'>
RE_REFRESH_URL = re.compile(r'''
    content \s* = \s*
    ["\']* \s* \d+ \s*
    ;?
    (?: \s* url \s* = \s*)?
    ["\']* ([^\'"> ]*)
''', re.I | re.X)

# Named entity such as &amp; or &frac12;.  Entity names are case-sensitive
# and may contain digits and capitals (&sup2;, &Aacute;), so the name is
# matched as a letter followed by letters/digits rather than only [a-z]+.
RE_ENTITY = re.compile(r'(&[a-zA-Z][a-zA-Z0-9]*;)')

# Decimal numeric entity such as &#169;.
RE_NUM_ENTITY = re.compile(r'(&#[0-9]+;)')

# Hexadecimal numeric entity such as &#xA9; (case-insensitive).
RE_HEX_ENTITY = re.compile(r'(&#x[a-f0-9]+;)', re.I)

# The href value of a <base> tag.
RE_BASE_URL = re.compile(r'<base[^>]+href\s*=["\']*([^\'"> ]+)', re.I)

# A <br>, <br/> or <br /> tag (case-insensitive).
RE_BR = re.compile(r'<br\s*/?>', re.I)
# Character constructor that works on both Python 2 (unichr) and Python 3
# (chr), kept local so this function needs nothing beyond the stdlib.
try:
    _unichr = unichr
except NameError:
    _unichr = chr

# One pattern for all three entity forms so the text is scanned exactly once.
# Group 1: hex digits of &#xHHH;  group 2: decimal digits of &#NNN;
# group 3: the name of &name; (names may contain capitals and digits).
_RE_ANY_ENTITY = re.compile(
    r'&(?:#[xX]([0-9a-fA-F]+)|#([0-9]+)|([a-zA-Z][a-zA-Z0-9]*));'
)


def decode_entities(html):
    """
    Convert all HTML entities into their unicode
    representations.

    This function processes the following entities:

    * named entities (``&name;``) listed in ``name2codepoint``
    * decimal numeric entities (``&#NNN;``)
    * hexadecimal numeric entities (``&#xHHH;``)

    The text is decoded in a single pass, so the output of one
    replacement is never decoded again (``&#38;lt;`` becomes the text
    ``&lt;``, not ``<``).  An entity that cannot be converted, because
    its name is unknown or its number is not a valid code point, is
    left in the text unchanged.

    Example::

        >>> print(decode_entities('&rarr;ABC&nbsp;R&copy;'))
        →ABC R©

    :param html: text that may contain HTML entities
    :returns: the text with every recognised entity replaced
    """

    def replace_entity(match):
        hex_digits, dec_digits, name = match.groups()
        try:
            if name is not None:
                return _unichr(name2codepoint[name])
            if hex_digits is not None:
                return _unichr(int(hex_digits, 16))
            return _unichr(int(dec_digits))
        except (KeyError, ValueError, OverflowError):
            # KeyError: unknown entity name.
            # ValueError / OverflowError: the number is outside the range
            # the character constructor accepts.
            return match.group(0)

    return _RE_ANY_ENTITY.sub(replace_entity, html)
def find_refresh_url(html):
    """
    Return the redirect url declared by an http-equiv refresh meta tag.

    Entities are decoded first so that entity-encoded quote characters
    around attribute values do not prevent the url from being found.

    :param html: text of the document to search
    :returns: the url captured from the tag's content attribute, or
        ``None`` when no refresh meta tag is found or its content
        attribute does not match
    """
    decoded = decode_entities(html)

    tag_match = RE_REFRESH_TAG.search(decoded)
    if not tag_match:
        return None

    # Only look for the url inside the refresh tag itself.
    url_match = RE_REFRESH_URL.search(tag_match.group(0))
    if not url_match:
        return None

    return url_match.group(1)
def find_base_url(html):
    """
    Return the href value of the document's <base> tag.

    :param html: text of the document to search
    :returns: the href captured from the first <base> tag found after
        entity decoding, or ``None`` when there is no such tag
    """
    found = RE_BASE_URL.search(decode_entities(html))
    return found.group(1) if found else None
def strip_tags(html, normalize_space=True, convert_br=False):
    """
    Remove tags from the given HTML, replacing each one with a space.

    :param html: text to strip tags from
    :param normalize_space: when true, the result is passed through
        ``weblib.text.normalize_space`` before being returned
    :param convert_br: when true, ``<br>`` tags are replaced with a
        newline before the remaining tags are removed
        (NOTE: whether those newlines survive ``normalize_space`` depends
        on that helper; verify before relying on it)
    :returns: the text with tags removed
    """
    source = RE_BR.sub('\n', html) if convert_br else html
    stripped = RE_TAG.sub(' ', source)
    if not normalize_space:
        return stripped
    return normalize_space_func(stripped)
def escape(html):
    """
    Return the given HTML with ampersands, quotes and angle brackets encoded.

    :param html: text to escape
    :returns: the text with ``&``, ``<``, ``>``, ``"`` and ``'`` replaced
        by ``&amp;``, ``&lt;``, ``&gt;``, ``&quot;`` and ``&#39;``
    """
    # The ampersand must be replaced first; otherwise the "&" introduced
    # by the other replacements would itself be escaped again.
    replacements = (
        ('&', '&amp;'),
        ('<', '&lt;'),
        ('>', '&gt;'),
        ('"', '&quot;'),
        ("'", '&#39;'),
    )
    for char, entity in replacements:
        html = html.replace(char, entity)
    return html