Source code for weblib.etree

"""
Functions to process content of lxml nodes.
"""
import re

from weblib.text import normalize_space as normalize_space_func, find_number
from weblib.encoding import smart_str, smart_unicode

from weblib.py3k_support import *

# Matches "<" immediately followed by a lowercase ASCII letter, i.e. the
# beginning of an opening tag. `sanitize_html` searches its input with this
# pattern to decide whether the input should be re-parsed and re-rendered.
# NOTE(review): the pattern is case-sensitive, so upper-case tags such as
# "<DIV>" do not match -- confirm that this is intended.
RE_TAG_START = re.compile(r'<[a-z]')


def get_node_text(node, smart=False, normalize_space=True):
    """
    Return the text of `node` together with the text of its descendants.

    :param node: an element, or a plain string (which is what an xpath
        query for an attribute value yields).
    :param smart: when true, the non-blank text fragments are joined with
        a single space, so text of adjacent tags does not stick together,
        and text located directly inside <script> and <style> tags is
        skipped. When false, `node.text_content()` is used.
    :param normalize_space: when true, the result is passed through
        `normalize_space_func` before it is returned.
    """
    # An xpath query for an attribute returns a string, not a node.
    if isinstance(node, basestring):
        return normalize_space_func(node) if normalize_space else node

    if smart:
        fragments = node.xpath(
            './descendant-or-self::*[name() != "script" and '
            'name() != "style"]/text()[normalize-space()]')
        value = ' '.join(fragments)
    else:
        # Nodes of a tree built with lxml.etree.fromstring do not have
        # the text_content() method, so collect the text nodes by hand.
        try:
            value = node.text_content()
        except AttributeError:
            value = ''.join(node.xpath('.//text()'))

    return normalize_space_func(value) if normalize_space else value
def find_node_number(node, ignore_spaces=False, make_int=True):
    """
    Extract a number from the text content of `node`.

    The text is obtained with `get_node_text(node)` and then handed to
    `find_number` together with the `ignore_spaces` and `make_int` options;
    whatever `find_number` returns is returned unchanged.
    """
    return find_number(get_node_text(node),
                       ignore_spaces=ignore_spaces,
                       make_int=make_int)
def truncate_tail(node, xpath):
    """
    Cut off the first element matched by `xpath` and everything after it.

    The first match of `xpath` (evaluated on `node`) is looked up, then
    every element returned by its ``following-sibling::*`` query is removed
    from its parent, and finally the matched element itself is removed.

    Raises IndexError when `xpath` matches nothing.
    """
    target = node.xpath(xpath)[0]
    # Following siblings go first, the matched element goes last.
    doomed = list(target.xpath('following-sibling::*'))
    doomed.append(target)
    for item in doomed:
        item.getparent().remove(item)
def parse_html(html, encoding='utf-8'):
    """
    Build an lxml.html tree from `html`.

    The document is parsed by `lxml.html.fromstring` using an
    `lxml.html.HTMLParser` created with the given `encoding`; the value
    returned by `fromstring` is returned as is.
    """
    from lxml.html import HTMLParser, fromstring

    return fromstring(html, parser=HTMLParser(encoding=encoding))
def render_html(node, encoding=None, make_unicode=None):
    """
    Render an Element node to HTML.

    :param node: the element to serialize with `lxml.html.tostring`.
    :param encoding: when None (the default) a unicode string is returned;
        otherwise the node is serialized with this encoding and the
        result of `lxml.html.tostring` is returned unchanged.
    :param make_unicode: deprecated. Any value other than None logs an
        error; a true value forces a unicode result regardless of
        `encoding`.
    """
    # `logging` is not imported at module level in this file, so import it
    # here; otherwise the deprecation branch fails with NameError.
    import logging
    import lxml.html

    if make_unicode is not None:
        logging.error('Argument `make_unicode` of fuction `render_html` is '
                      'deprecated. Used `encoding=None` to get unicode '
                      'result.')
    if make_unicode or encoding is None:
        # Serialize to UTF-8 bytes and decode them to get a unicode string.
        return lxml.html.tostring(node, encoding='utf-8').decode('utf-8')
    return lxml.html.tostring(node, encoding=encoding)
def render_node(node):
    """
    Serialize `node` to HTML and return it as a unicode string.

    The node is rendered to UTF-8 bytes with `lxml.html.tostring` and the
    bytes are then decoded as UTF-8.
    """
    from lxml.html import tostring

    markup = tostring(node, encoding='utf-8')
    return markup.decode('utf-8')
def truncate_html(html, limit, encoding='utf-8'):
    """
    Cut HTML data down to `limit` characters and re-render it.

    Input that is not a unicode string is decoded with `encoding` first, so
    `limit` counts characters of the decoded text. The truncated text is
    parsed with `parse_html` and rendered back with `render_html` using the
    same `encoding`, which is how markup broken by the cut gets fixed.
    """
    text = html if isinstance(html, unicode) else html.decode(encoding)
    tree = parse_html(text[:limit], encoding=encoding)
    return render_html(tree, encoding=encoding)
def clone_node(elem):
    """
    Create a clone of an Element node.

    The clone is produced by rendering `elem` with `render_html` and
    parsing the resulting markup again with `parse_html`, so it is not
    connected to the original DOM tree.
    """
    markup = render_html(elem)
    return parse_html(markup)
def sanitize_html(html, encoding='utf-8', return_unicode=False):
    """
    Normalize an HTML fragment by parsing and re-rendering it.

    Input in which `RE_TAG_START` finds no tag is returned without being
    re-parsed.

    :param html: unicode or byte string with the markup.
    :param encoding: encoding of byte input; also the encoding of the
        byte-string result.
    :param return_unicode: when true a unicode string is returned,
        otherwise a byte string encoded with `encoding`.
    """
    # NOTE(review): smart_str is assumed to return a byte string encoded
    # with `encoding` -- confirm against weblib.encoding.
    raw = smart_str(html, encoding=encoding)

    # RE_TAG_START is a text pattern, so it has to be applied to decoded
    # text (applying it to bytes fails on Python 3). Undecodable bytes are
    # replaced here because the decoded copy is used for detection only.
    if RE_TAG_START.search(raw.decode(encoding, 'replace')):
        # Parse with the caller's encoding instead of parse_html's default.
        tree = parse_html(raw, encoding=encoding)
        if return_unicode:
            return render_html(tree)
        return render_html(tree, encoding=encoding)

    if return_unicode:
        return raw.decode(encoding)
    return raw
def drop_node(tree, xpath, keep_content=False):
    """
    Find sub-nodes by xpath and remove them from the tree.

    :param tree: element on which `xpath` is evaluated.
    :param xpath: expression selecting the nodes to drop.
    :param keep_content: when false the node is removed together with its
        content and only its tail text is kept. When true only the tag
        itself is dropped: the node's text, its children and its tail text
        stay in the parent, in document order.

    Matched nodes that have no parent (e.g. the root) are left untouched.
    """
    for node in tree.xpath(xpath):
        parent = node.getparent()
        if parent is None:
            # Nothing to detach the node from.
            continue

        if not keep_content:
            replace_rawnode_with_text(node, node.tail or '')
            continue

        children = list(node)
        if not children:
            # No children: text and tail simply follow each other.
            replace_rawnode_with_text(
                node, (node.text or '') + (node.tail or ''))
            continue

        # The node's tail must end up AFTER the hoisted children, so attach
        # it to the tail of the last child before moving anything.
        if node.tail:
            last_child = children[-1]
            last_child.tail = (last_child.tail or '') + node.tail

        # Move the children one level up, right after the node.
        pos = parent.index(node) + 1
        for child in children:
            parent.insert(pos, child)
            pos += 1

        # The node's own text belongs BEFORE the hoisted children, which is
        # exactly where replace_rawnode_with_text puts the replacement text.
        replace_rawnode_with_text(node, node.text or '')
def replace_node_with_text(root, xpath, text):
    """
    Replace every node matched by `xpath` under `root` with `text`.

    The tail text of a replaced node, if any, is appended to `text` so
    that it is not lost together with the node.
    """
    for node in root.xpath(xpath):
        if node.tail:
            replacement = text + node.tail
        else:
            replacement = text
        replace_rawnode_with_text(node, replacement)


def replace_rawnode_with_text(node, text):
    """
    Remove `node` from its parent and put `text` in its place.

    The text is appended to the tail of the previous sibling or, when there
    is no previous sibling, to the text of the parent. A node without a
    parent is left untouched.
    """
    parent = node.getparent()
    if parent is None:
        return

    sibling = node.getprevious()
    if sibling is None:
        parent.text = (parent.text or '') + text
    else:
        sibling.tail = (sibling.tail or '') + text
    parent.remove(node)
def clean_html(html, safe_attrs=('src', 'href'), input_encoding=None,
               output_encoding=None, **kwargs):
    """
    Fix HTML structure and remove non-allowed attributes from all tags.

    :param html: markup to clean.
    :param safe_attrs: names of the attributes that are kept; when this
        value is empty or None no attribute is removed.
    :param input_encoding: encoding passed to `parse_html` for the input.
    :param output_encoding: encoding passed to `render_html` for the result.
    :param kwargs: extra options passed to `lxml.html.clean.Cleaner`
        (which is always created with ``page_structure=True``).
    """
    from lxml.html.clean import Cleaner

    # Round-trip through the parser first to normalize the structure.
    normalized = render_html(parse_html(html, encoding=input_encoding))

    # Let the lxml Cleaner strip whatever it is configured to strip.
    cleaned = Cleaner(page_structure=True, **kwargs).clean_html(normalized)

    # Keep only the allowed attributes.
    tree = parse_html(cleaned)
    if safe_attrs:
        for elem in tree.xpath('./descendant-or-self::*'):
            rejected = [key for key in elem.attrib.keys()
                        if key not in safe_attrs]
            for key in rejected:
                del elem.attrib[key]
    return render_html(tree, encoding=output_encoding)