# Source code for weblib.content

import re
from copy import deepcopy

from weblib.text import normalize_space, find_number

def find_content_blocks(tree, min_length=None):
    """
    Extract plain-text content blocks from a DOM tree.

    The function works on a deep copy of ``tree``, so the caller's tree is
    never modified.  Non-data elements (``head``, ``style``, ``script``) are
    removed together with their content; comments, inline tags and media
    tags are unwrapped so that their surrounding text is merged into the
    parent.  The remaining markup is serialized, every remaining tag is
    collapsed to ``<>`` and the text runs between tags become the candidate
    blocks.

    A candidate block is kept only if:

    * ``min_length`` is ``None`` or the block has at least ``min_length``
      characters;
    * its ``_trash_ratio`` is below 0.05;
    * none of its whitespace-separated words is longer than 50 characters.

    :param tree: lxml element (or tree) to analyse
    :param min_length: minimal length of a block in characters, or ``None``
        to disable the length filter
    :returns: list of text blocks in the order they occur in the serialized
        markup
    """
    from lxml.html import tostring
    from lxml.etree import strip_tags, strip_elements, Comment

    # First, make a copy of DOM-tree to not harm external code
    tree = deepcopy(tree)

    # Completely remove the following tags together with their content.
    # with_tail=False: strip_elements() drops the tail text by default,
    # which would silently lose the text that follows e.g. a <script>.
    nondata_tags = ('head', 'style', 'script')
    strip_elements(tree, *nondata_tags, with_tail=False)

    # Remove comment nodes (keep tail text)
    strip_tags(tree, Comment)

    # Unwrap links and other inline tags, keeping their text
    inline_tags = ('br', 'hr', 'p', 'b', 'i', 'strong', 'em', 'a',
                   'span', 'font')
    strip_tags(tree, *inline_tags)

    # Unwrap media tags
    media_tags = ('img',)
    strip_tags(tree, *media_tags)

    # NOTE(review): the blocks are cut out of serialized markup, so special
    # characters are presumably still entity-escaped (e.g. "&amp;") in the
    # returned text -- confirm whether callers expect that.
    body = tostring(tree, encoding='utf-8').decode('utf-8')

    # Normalize spaces
    body = normalize_space(body)

    # Remove ALL chars from tags, so every tag becomes a bare "<>" separator
    re_tag = re.compile(r'<[^>]+>')
    body = re_tag.sub(r'<>', body)

    # Find text blocks: maximal runs of characters between the separators
    block_rex = re.compile(r'[^<>]+')

    blocks = []
    for match in block_rex.finditer(body):
        block = match.group(0)
        if min_length is not None and len(block) < min_length:
            continue
        # Skip blocks dominated by punctuation/special symbols
        if _trash_ratio(block) >= 0.05:
            continue
        # Skip blocks containing over-long "words" (URLs, hashes, ...)
        if any(len(word) > 50 for word in block.split()):
            continue
        blocks.append(block)
    return blocks


def _trash_ratio(text):
    """
    Return ratio of non-common symbols.
    """

    trash_count = 0
    for char in text:
        if char in list(u'.\'"+-!?()[]{}*+@#$%^&_=|/\\'):
            trash_count += 1
    return trash_count / float(len(text))