Source code for weblib.etree

"""
Functions to process content of lxml nodes.
"""
import re

from weblib.text import normalize_space as normalize_space_func, find_number
from weblib.encoding import smart_str, smart_unicode

from weblib.py3k_support import *

RE_TAG_START = re.compile(r'<[a-z]')


[docs]def get_node_text(node, smart=False, normalize_space=True):
    """
    Extract text content of the `node` and all its descendants.

    In smart mode `get_node_text` insert spaces between <tag><another tag>
    and also ignores content of the script and style tags.

    In non-smart mode this func just return text_content() of node
    with normalized spaces
    """

    # If xpath return a attribute value, it value will be string not a node
    if isinstance(node, basestring):
        if normalize_space:
            node = normalize_space_func(node)
        return node

    if smart:
        value = ' '.join(node.xpath(
            './descendant-or-self::*[name() != "script" and '\
            'name() != "style"]/text()[normalize-space()]'))
    else:
        # If DOM tree was built with lxml.etree.fromstring
        # then tree nodes do not have text_content() method
        try:
            value = node.text_content()
        except AttributeError:
            value = ''.join(node.xpath('.//text()'))
    if normalize_space:
        value = normalize_space_func(value)
    return value


[docs]def find_node_number(node, ignore_spaces=False, make_int=True):
    """
    Find number in text content of the `node`.
    """

    text = get_node_text(node)
    return find_number(text, ignore_spaces=ignore_spaces, make_int=make_int)


[docs]def truncate_tail(node, xpath):
    """
    Find sub-node by its xpath and remove it and all adjacent nodes following
    after found node.
    """

    subnode = node.xpath(xpath)[0]
    for item in subnode.xpath('following-sibling::*'):
        item.getparent().remove(item)
    subnode.getparent().remove(subnode)


[docs]def parse_html(html, encoding='utf-8'):
    """
    Parse html into ElementTree node.
    """
    import lxml.html

    parser = lxml.html.HTMLParser(encoding=encoding)
    return lxml.html.fromstring(html, parser=parser)


[docs]def render_html(node, encoding=None, make_unicode=None):
    """
    Render Element node.
    """
    import lxml.html

    if make_unicode is not None:
        logging.error('Argument `make_unicode` of fuction `render_html` is '
                      'deprecated. Used `encoding=None` to get unicode '
                      'result.')
    if make_unicode or encoding is None:
        return lxml.html.tostring(node, encoding='utf-8').decode('utf-8')
    else:
        return lxml.html.tostring(node, encoding=encoding)


def render_node(node):
    import lxml.html

    return lxml.html.tostring(node, encoding='utf-8').decode('utf-8')


[docs]def truncate_html(html, limit, encoding='utf-8'):
    """
    Truncate html data to specified length and then fix broken tags.
    """

    if not isinstance(html, unicode):
        html = html.decode(encoding)
    truncated_html = html[:limit]
    elem = parse_html(truncated_html, encoding=encoding)
    fixed_html = render_html(elem, encoding=encoding)
    return fixed_html


[docs]def clone_node(elem):
    """
    Create clone of Element node.

    The resulted clone is not connected ot original DOM tree.
    """

    return parse_html(render_html(elem))


[docs]def disable_links(elem):
    """
    Replace all links with span tags and drop href atrributes.
    """

    for node in elem.xpath('.//a'):
        node.tag = 'span'
        if 'href' in node.attrib:
            del node.attrib['href']


def sanitize_html(html, encoding='utf-8', return_unicode=False):
    html = smart_str(html, encoding=encoding)
    if RE_TAG_START.search(html):
        html = render_html(parse_html(html))
    if return_unicode:
        return html.decode('utf-8')
    else:
        return html


[docs]def drop_node(tree, xpath, keep_content=False):
    """
    Find sub-node by its xpath and remove it.
    """

    for node in tree.xpath(xpath):
        parent = node.getparent()
        if keep_content:
            # Find position of node in list of adjacent nodes
            pos = parent.index(node) + 1
            # move all node's childrent to level higher
            for subnode in node:
                parent.insert(pos, subnode)
                pos += 1
            # now replace node with its text
            node_text = (node.text or '') + (node.tail or '')
            replace_rawnode_with_text(node, node_text)
        else:
            replace_rawnode_with_text(node, node.tail or '')


def replace_node_with_text(root, xpath, text):
    for node in root.xpath(xpath):
        new_text = (text + node.tail) if node.tail else text
        replace_rawnode_with_text(node, new_text)


def replace_rawnode_with_text(node, text):
    parent = node.getparent()
    if parent is not None:
        previous = node.getprevious()
        if previous is not None:
            previous.tail = (previous.tail or '') + text
        else:
            parent.text = (parent.text or '') + text
        parent.remove(node)


[docs]def clean_html(html, safe_attrs=('src', 'href'),
               input_encoding=None,
               output_encoding=None,
               **kwargs):
    """
    Fix HTML structure and remove non-allowed attributes from all tags.
    """

    from lxml.html.clean import Cleaner

    tree = parse_html(html, encoding=input_encoding)
    html = render_html(tree)

    # Strip some trash with default lxml weblib
    cleaner = Cleaner(page_structure=True, **kwargs)
    html = cleaner.clean_html(html)

    # Keep only allowed attributes
    tree = parse_html(html)
    for elem in tree.xpath('./descendant-or-self::*'):
        for key in elem.attrib.keys():
            if safe_attrs:
                if key not in safe_attrs:
                    del elem.attrib[key]

    return render_html(tree, encoding=output_encoding)