Source code for weblib.text

"""
Text parsing and processing utilities.
"""
import re

from weblib.error import DataNotFound
#from weblib.py3k_support import *

# One or more consecutive digits.
RE_NUMBER = re.compile(r'\d+')
# A digit followed by any mix of space-chars and digits, e.g. "1 000 000".
# The match may end with space-chars, so callers have to drop them.
RE_NUMBER_WITH_SPACES = re.compile(r'\d[\s\d]*', re.U)
# One or more consecutive space-chars.
RE_SPACE = re.compile(r'\s+', re.U)
# The UTF-8 byte order mark (EF BB BF) written as a plain string literal.
# NOTE(review): a plain literal is a byte string on Python 2 but a text
# string on Python 3 -- confirm which interpreter this module targets.
BOM_TOKEN = '\xef\xbb\xbf'


def find_number(text, ignore_spaces=False, make_int=True, ignore_chars=None):
    """
    Locate the first number inside the `text`.

    :param text: unicode or byte-string text
    :param ignore_spaces: if True then groups of digits delimited by
        spaces are considered as one number
    :param make_int: if True then the found value is converted with
        `int`, otherwise the matched string is returned
    :param ignore_chars: optional iterable of substrings which are removed
        from the `text` before the search
    :raises: :class:`DataNotFound` if number was not found.
    """
    # Strip the unwanted substrings first so they cannot split a number.
    for junk in ignore_chars or ():
        text = text.replace(junk, '')

    pattern = RE_NUMBER_WITH_SPACES if ignore_spaces else RE_NUMBER
    found = pattern.search(text)
    if not found:
        raise DataNotFound

    number = found.group(0)
    if ignore_spaces:
        # The match may contain inner/trailing space-chars: glue the digits.
        number = drop_space(number)
    return int(number) if make_int else number
def drop_space(text):
    """
    Return the `text` with every space-char removed.
    """
    without_spaces = RE_SPACE.sub('', text)
    return without_spaces
def normalize_space(text, replace=' '):
    """
    Collapse each sequence of space-chars in the `text` into `replace`
    (one space char by default).

    Leading and trailing space-chars are dropped as well.
    """
    trimmed = text.strip()
    collapsed = RE_SPACE.sub(replace, trimmed)
    # Final strip mirrors the original expression exactly.
    return collapsed.strip()
def remove_bom(text):
    """
    Remove the UTF-8 BOM-sequence from the start of byte string.

    :param text: byte string which may start with the bytes EF BB BF
    :returns: the `text` without the leading BOM, or the `text` unchanged
        if it does not start with the BOM
    :raises: :class:`RuntimeConfigError` if `text` is a unicode string
    """
    # Function-scope import keeps this block self-contained.
    import codecs

    # `type(u'')` is `unicode` on Python 2 and `str` on Python 3, so the
    # check does not depend on the `unicode` builtin which Python 3 lacks.
    text_type = type(u'')
    if isinstance(text, text_type):
        # The name was referenced without being imported in this module.
        # NOTE(review): assumes RuntimeConfigError is defined in
        # weblib.error next to DataNotFound -- confirm.
        from weblib.error import RuntimeConfigError
        raise RuntimeConfigError('The function remove_bom accepts only byte strings')

    # codecs.BOM_UTF8 is a real byte string on both Python 2 and 3, while
    # the module-level BOM_TOKEN literal is a text string on Python 3 and
    # cannot be compared with bytes there.
    bom = codecs.BOM_UTF8
    if text.startswith(bom):
        return text[len(bom):]
    return text
#def strip_space(text): #"""Strip all spaces at begin or end of the text""" #return RE_STRIP_SPACE.sub('', text)