Source code for weblib.content

import re
from copy import deepcopy

from weblib.text import normalize_space, find_number

def find_content_blocks(tree, min_length=None):
    """
    Return the text blocks of a document that look like real content.

    The tree is copied, non-data elements are removed, inline and media
    tags are unwrapped, and the result is serialized back to markup.
    Every run of text between two remaining tags is a candidate block.
    A candidate is kept when all of the following hold:

    * it is at least ``min_length`` characters long (skipped when
      ``min_length`` is ``None``);
    * fewer than 5% of its characters are "trash" symbols, as counted
      by ``_trash_ratio``;
    * none of its whitespace-separated words is longer than 50
      characters.

    NOTE(review): the original docstring labelled this the "russian
    version"; nothing visible in this function is language-specific.

    :param tree: lxml element (or tree) to analyse; it is not modified.
    :param min_length: minimal block length in characters, or ``None``
        to accept blocks of any length.
    :returns: list of text blocks in document order. The text is cut
        out of serialized markup, so characters that lxml escapes on
        output (presumably e.g. ``&amp;``) stay escaped.
    """
    from lxml.html import tostring
    from lxml.etree import strip_tags, strip_elements, Comment

    # Work on a copy so the caller's DOM tree is left untouched.
    tree = deepcopy(tree)

    # Drop non-data elements together with everything inside them.
    # with_tail=False matters: by default lxml also deletes the text that
    # FOLLOWS a removed element (its tail), and that text is ordinary
    # content of the parent, e.g. "<div><script>..</script>text</div>".
    nondata_tags = ('head', 'style', 'script')
    strip_elements(tree, *nondata_tags, with_tail=False)

    # Remove comment nodes (strip_tags keeps their tail text).
    strip_tags(tree, Comment)

    # Unwrap inline tags (links included): the tag goes away, its text
    # and children are merged into the parent.
    inline_tags = ('br', 'hr', 'p', 'b', 'i', 'strong', 'em', 'a', 'span',
                   'font')
    strip_tags(tree, *inline_tags)

    # Unwrap media tags the same way.
    media_tags = ('img',)
    strip_tags(tree, *media_tags)

    body = tostring(tree, encoding='utf-8').decode('utf-8')

    # Normalize spaces (delegated to weblib.text.normalize_space).
    body = normalize_space(body)

    # Reduce every remaining tag to a bare "<>" separator so that tag
    # names and attributes can never be mistaken for text.
    body = re.sub(r'<[^>]+>', r'<>', body)

    # Every run of characters between separators is a candidate block.
    blocks = []
    for match in re.finditer(r'[^<>]+', body):
        block = match.group(0)
        if min_length is not None and len(block) < min_length:
            continue
        if _trash_ratio(block) >= 0.05:
            continue
        # Very long "words" are typically URLs, hashes or other non-prose.
        if any(len(word) > 50 for word in block.split()):
            continue
        blocks.append(block)
    return blocks
def _trash_ratio(text): """ Return ratio of non-common symbols. """ trash_count = 0 for char in text: if char in list(u'.\'"+-!?()[]{}*+@#$%^&_=|/\\'): trash_count += 1 return trash_count / float(len(text))