import re
from weblib.error import DataNotFound
from weblib.text import normalize_space
from weblib.html import decode_entities
from weblib.py3k_support import *
# Module-level cache of compiled regular expressions, keyed by
# (pattern, flags) tuples; populated lazily by cache_regexp().
REGEXP_CACHE = {}
# Unique sentinel meaning "no default supplied". It is compared by identity,
# so callers may still pass None (or any other value) as a real default.
NULL = object()
def cache_regexp(rex, flags=0):
    """
    Return the compiled form of `rex`, compiling it at most once.

    Compiled objects are stored in the module-level `REGEXP_CACHE` under
    the `(rex, flags)` key, so the same pattern compiled with different
    flags gets separate cache entries.
    """
    cache_key = (rex, flags)
    compiled = REGEXP_CACHE.get(cache_key)
    if compiled is None:
        # First request for this pattern/flags pair: compile and remember it.
        compiled = re.compile(rex, flags)
        REGEXP_CACHE[cache_key] = compiled
    return compiled
def rex(body, regexp, flags=0, byte=False, default=NULL):
    """
    Search `regexp` expression in `body` text.

    :param body: text to search in
    :param regexp: pattern string or an already compiled regular expression
    :param flags: `re` flags; only applied when `regexp` is given as a
        string, a compiled expression is used as is
    :param byte: not used by this function; kept in the signature so that
        existing callers passing it keep working
    :param default: value to return when nothing is found; when omitted
        a `DataNotFound` exception is raised instead
    :returns: the match object of the first occurrence, or `default`
    :raises DataNotFound: if nothing matched and no `default` was given
    """
    regexp = normalize_regexp(regexp, flags)
    match = regexp.search(body)
    if match:
        return match
    if default is NULL:
        # Report the pattern text, not the compiled object, so the error
        # message shows what was actually searched for.
        pattern = getattr(regexp, 'pattern', regexp)
        raise DataNotFound('Could not find regexp: %s' % (pattern,))
    return default
def rex_text(body, regexp, flags=0, default=NULL):
    """
    Search `regexp` expression in `body` text and return the cleaned-up
    content of its first group.

    The first group of the match is passed through `decode_entities` and
    then `normalize_space`. If the search result has no `group` method
    (i.e. `rex` handed back `default` instead of a match), `default` is
    returned, or `DataNotFound` is raised when no default was given.
    """
    found = rex(body, regexp, flags=flags, default=default)
    try:
        raw = found.group(1)
        text = normalize_space(decode_entities(raw))
    except AttributeError:
        if default is not NULL:
            return default
        raise DataNotFound('Regexp not found')
    return text
def normalize_regexp(regexp, flags=0):
    """
    Turn `regexp` into a compiled regular expression object.

    A string is compiled (through the `cache_regexp` cache) with the
    given `flags`; anything else is assumed to be compiled already and is
    returned unchanged, in which case `flags` is ignored.
    """
    if not isinstance(regexp, basestring):
        return regexp
    return cache_regexp(regexp, flags)
def rex_list(body, rex, flags=0):
    """
    Return a list of match objects for every occurrence of `rex` in `body`.

    `rex` may be a pattern string or a compiled regular expression.
    """
    # NOTE: the `rex` parameter shadows the module-level `rex` function
    # inside this body; a separate local name is used for the compiled form.
    pattern = normalize_regexp(rex, flags)
    matches = pattern.finditer(body)
    return list(matches)
def rex_text_list(body, rex, flags=0):
    """
    Return the cleaned-up first group of every match of `rex` in `body`.

    Each match's first group is passed through `decode_entities` and then
    `normalize_space`; the results are returned as a list in match order.
    """
    return [
        normalize_space(decode_entities(found.group(1)))
        for found in rex_list(body, rex, flags=flags)
    ]