# Source code for weblib.rex

import re

from weblib.error import DataNotFound
from weblib.text import normalize_space
from weblib.html import decode_entities
from weblib.py3k_support import *

REGEXP_CACHE = {}
NULL = object()


def extract_rex_list(rex, body):
    """
    Return found matches.

    :param rex: compiled regular expression object (anything exposing a
        ``findall`` method)
    :param body: text to search in
    :returns: the result of ``rex.findall(body)``; for patterns from the
        ``re`` module that is a list of strings, or a list of tuples when
        the pattern has more than one group
    """

    return rex.findall(body)


def cache_regexp(rex, flags=0):
    """
    Compile `rex` with `flags`, reusing a previously compiled object.

    Compiled objects are stored in the module-level ``REGEXP_CACHE`` dict
    under the ``(rex, flags)`` pair, so each distinct pair is compiled once.
    """

    cache_key = (rex, flags)
    compiled = REGEXP_CACHE.get(cache_key)
    if compiled is None:
        # First request for this pattern/flags pair: compile and remember it.
        compiled = REGEXP_CACHE[cache_key] = re.compile(rex, flags)
    return compiled


def rex(body, regexp, flags=0, byte=False, default=NULL):
    """
    Search `regexp` expression in `body` text.

    :param body: text to search in
    :param regexp: pattern string or compiled regular expression object
    :param flags: passed to `normalize_regexp` together with `regexp`
    :param byte: not used by this function; kept in the signature so that
        existing callers passing it keep working
    :param default: value to return when nothing matches
    :returns: the match object of the first match, or `default` when there
        is no match and `default` was supplied
    :raises DataNotFound: when there is no match and `default` was not
        supplied
    """

    regexp = normalize_regexp(regexp, flags)
    match = regexp.search(body)
    if match:
        return match
    # NULL is the "no default given" sentinel, so None is a usable default.
    if default is NULL:
        raise DataNotFound('Could not find regexp: %s' % regexp)
    return default


def rex_text(body, regexp, flags=0, default=NULL):
    """
    Search `regexp` expression in `body` text and return the cleaned text
    of the first group.

    The text of group 1 is passed through `decode_entities` and then
    `normalize_space`.

    :param body: text to search in
    :param regexp: pattern string or compiled regular expression object;
        it must define at least one group
    :param flags: forwarded to `rex`
    :param default: value to return when nothing is found
    :returns: processed text of group 1, or `default`
    :raises DataNotFound: when nothing is found and `default` was not
        supplied
    """

    match = rex(body, regexp, flags=flags, default=default)
    # When nothing matched, rex() hands back `default` instead of a match
    # object. Only the group access is guarded, so an AttributeError raised
    # while processing real text is not mistaken for "not found".
    try:
        found = match.group(1)
    except AttributeError:
        found = None
    # `found` is also None when group 1 is optional and did not take part
    # in the match.
    if found is None:
        if default is NULL:
            raise DataNotFound('Regexp not found')
        return default
    return normalize_space(decode_entities(found))


def normalize_regexp(regexp, flags=0):
    """
    Accept string or compiled regular expression object.

    A string is compiled with `flags` through `cache_regexp`. Any other
    value is returned unchanged, and `flags` is not applied to it.

    :param regexp: pattern string or compiled regular expression object
    :param flags: flags used only when `regexp` is a string
    :returns: compiled regular expression object
    """

    # `basestring` is not imported by name here; presumably it comes from
    # the `weblib.py3k_support` star import -- verify.
    if isinstance(regexp, basestring):
        return cache_regexp(regexp, flags)
    return regexp


def rex_list(body, rex, flags=0):
    """
    Return found matches.

    :param body: text to search in
    :param rex: pattern string or compiled regular expression object
    :param flags: passed to `normalize_regexp` together with `rex`
    :returns: list of the match objects yielded by ``finditer``; empty
        when nothing matches
    """

    # Use a separate local name rather than rebinding the `rex` parameter.
    pattern = normalize_regexp(rex, flags)
    return list(pattern.finditer(body))


def rex_text_list(body, rex, flags=0):
    """
    Return the cleaned text of the first group of every match.

    For each match found by `rex_list`, the text of group 1 is passed
    through `decode_entities` and then `normalize_space`.

    :param body: text to search in
    :param rex: pattern string or compiled regular expression object;
        it must define at least one group
    :param flags: forwarded to `rex_list`
    :returns: list of processed strings, one per match; empty when nothing
        matches
    """

    return [
        normalize_space(decode_entities(match.group(1)))
        for match in rex_list(body, rex, flags=flags)
    ]