Shortcuts

Source code for TexSoup.utils

import bisect
import functools


##############
# Decorators #
##############


def to_buffer(f):
    """
    Decorator converting all strings and iterators/iterables into Buffers.

    The wrapped function receives a ``Buffer`` as its first positional
    argument. The value to convert is taken from the ``iterator`` keyword
    argument when present, otherwise from the first positional argument.

    :param f: function whose first parameter is the iterator/buffer
    :return: wrapper that converts the iterator before calling ``f``
    :raises IndexError: if no ``iterator`` keyword and no positional
        argument is supplied
    """
    @functools.wraps(f)
    def wrap(*args, **kwargs):
        if 'iterator' in kwargs:
            # Remove the keyword so it is not forwarded a second time
            # alongside the positional Buffer below.
            iterator = kwargs.pop('iterator')
            rest = args
        else:
            iterator, rest = args[0], args[1:]
        if not isinstance(iterator, Buffer):
            iterator = Buffer(iterator)
        return f(iterator, *rest, **kwargs)
    return wrap

#########################
# Generalized Utilities #
#########################


class TokenWithPosition(str):
    """Enhanced string object with knowledge of global position.

    Instances behave like strings but additionally carry ``text`` (the
    wrapped value) and ``position`` (offset of the first character in the
    original buffer). Equality and hashing depend on ``text`` only.
    """

    # noinspection PyArgumentList
    def __new__(cls, text, position=None):
        """Initializer for pseudo-string object.

        If ``text`` is already a ``TokenWithPosition``, its text and
        position are copied and the ``position`` argument is ignored.

        :param text: The original string
        :param position: Position in the original buffer
        """
        self = str.__new__(cls, text)
        if isinstance(text, TokenWithPosition):
            self.text, self.position = text.text, text.position
        else:
            self.text = text
            self.position = position
        return self

    def __repr__(self):
        return repr(self.text)

    def __str__(self):
        return str(self.text)

    def __getattr__(self, name):
        # Only reached when normal attribute lookup fails; defer to the
        # wrapped text.
        return getattr(self.text, name)

    def __eq__(self, other):
        """Compare by text only; positions are ignored.

        >>> TokenWithPosition('asdf', 0) == TokenWithPosition('asdf', 2)
        True
        >>> TokenWithPosition('asdf', 0) == TokenWithPosition('asd', 0)
        False
        """
        if isinstance(other, TokenWithPosition):
            return self.text == other.text
        return self.text == other

    def __hash__(self):
        return hash(self.text)

    def __add__(self, other):
        """Implements addition in the form of TokenWithPosition(...) + (obj).

        The result keeps the position of the left operand.

        >>> t1 = TokenWithPosition('as', 0) + TokenWithPosition('df', 1)
        >>> str(t1)
        'asdf'
        >>> t1.position
        0
        >>> t2 = TokenWithPosition('as', 1) + 'df'
        >>> str(t2)
        'asdf'
        >>> t3 = TokenWithPosition(t2)
        >>> t3.position
        1
        """
        if isinstance(other, TokenWithPosition):
            return TokenWithPosition(self.text + other.text, self.position)
        return TokenWithPosition(self.text + other, self.position)

    def __radd__(self, other):
        """Implements addition in the form of (obj) + TokenWithPosition(...).

        Note that if the first element is TokenWithPosition,
        TokenWithPosition(...).__add__(...) will be used. As a result, we
        can assume WLOG that `other` is a type other than
        TokenWithPosition. The result's position is shifted left by
        ``len(other)``.

        >>> t1 = TokenWithPosition('as', 2) + TokenWithPosition('dfg', 2)
        >>> str(t1)
        'asdfg'
        >>> t1.position
        2
        >>> t2 = 'as' + TokenWithPosition('dfg', 2)
        >>> str(t2)
        'asdfg'
        >>> t2.position
        0
        """
        return TokenWithPosition(
            other + self.text, self.position - len(other))

    def __iadd__(self, other):
        """Implements addition in the form of TokenWithPosition(...) += ...

        A new object is returned, as with ordinary strings.

        >>> t1 = TokenWithPosition('as', 0)
        >>> t1 += 'df'
        >>> str(t1)
        'asdf'
        >>> t1.position
        0
        """
        return self.__add__(other)

    @classmethod
    def join(cls, tokens, glue=''):
        """Join a sequence of tokens into one token.

        :param tokens: sized sequence of TokenWithPosition objects
        :param glue: string placed between consecutive tokens
        :return: token positioned at the first token's position, or the
            plain empty string ``''`` when ``tokens`` is empty
        """
        if len(tokens) == 0:
            return ''
        return TokenWithPosition(
            glue.join(t.text for t in tokens), tokens[0].position)

    def __bool__(self):
        return bool(self.text)

    def __contains__(self, item):
        """
        >>> 'rg' in TokenWithPosition('corgi', 0)
        True
        >>> 'reg' in TokenWithPosition('corgi', 0)
        False
        >>> TokenWithPosition('rg', 0) in TokenWithPosition('corgi', 0)
        True
        """
        if isinstance(item, TokenWithPosition):
            return item.text in self.text
        return item in self.text

    def __iter__(self):
        """Iterate over characters, each carrying its own position.

        >>> list(TokenWithPosition('asdf', 0))
        ['a', 's', 'd', 'f']
        """
        return iter(self.__iter())

    def __iter(self):
        for i, c in enumerate(self.text):
            yield TokenWithPosition(c, self.position + i)

    def __getitem__(self, i):
        """Access characters in object just as with strings.

        The result's position is offset by the (normalized) start index.

        >>> t1 = TokenWithPosition('asdf', 2)
        >>> t1[0]
        'a'
        >>> t1[-1]
        'f'
        >>> t1[:]
        'asdf'
        """
        start = i if isinstance(i, int) else i.start
        if start is None:
            start = 0
        if start < 0:
            # Negative indices count from the end of the text.
            start = len(self.text) + start
        return TokenWithPosition(self.text[i], self.position + start)

    def split(self, sep=None, maxsplit=-1):
        """Split like ``str.split``, keeping each piece's position.

        >>> parts = TokenWithPosition('a a', 0).split()
        >>> [p.position for p in parts]
        [0, 2]

        :param sep: separator string, or None to split on whitespace
        :param maxsplit: maximum number of splits, -1 for no limit
        :return: list of TokenWithPosition objects
        """
        pieces = self.text.split(sep=sep, maxsplit=maxsplit)
        # An explicit separator sits between consecutive pieces and must
        # be skipped; whitespace runs (sep=None) are skipped by `find`.
        sep_len = 0 if sep is None else len(sep)
        result = []
        search_from = 0
        for piece in pieces:
            offset = self.text.find(piece, search_from)
            result.append(TokenWithPosition(piece, self.position + offset))
            # Resume after this piece so an identical or overlapping later
            # piece is not matched at an earlier location.
            search_from = offset + len(piece) + sep_len
        return result

    def strip(self, *args, **kwargs):
        """Strip both ends of the text, adjusting the position.

        >>> TokenWithPosition(' asdf ', 2).strip().position
        3
        """
        stripped = self.text.strip(*args, **kwargs)
        offset = self.text.find(stripped)
        return TokenWithPosition(stripped, self.position + offset)

    def lstrip(self, *args, **kwargs):
        """Strip leading whitespace for text.

        >>> t = TokenWithPosition(' asdf ', 2)
        >>> t.lstrip()
        'asdf '
        """
        stripped = self.text.lstrip(*args, **kwargs)
        offset = self.text.find(stripped)
        return TokenWithPosition(stripped, self.position + offset)

    def rstrip(self, *args, **kwargs):
        """Strip trailing whitespace for text.

        >>> t = TokenWithPosition(' asdf ', 2)
        >>> t.rstrip()
        ' asdf'
        """
        stripped = self.text.rstrip(*args, **kwargs)
        offset = self.text.find(stripped)
        return TokenWithPosition(stripped, self.position + offset)
# General Buffer class
class Buffer:
    """Converts string or iterable into a navigable iterator of strings

    Elements pulled from the underlying iterator are cached in a queue so
    the buffer can move backward and be indexed or sliced.

    >>> b1 = Buffer("012345")
    >>> next(b1)
    '0'
    >>> b1.forward()
    '1'
    >>> b1.endswith('1')
    True
    >>> b1.backward(2)
    '01'
    >>> b1.peek()
    '0'
    >>> b1.peek(2)
    '2'
    >>> b1.peek((0, 2))
    '01'
    >>> b1.startswith('01')
    True
    >>> b1[2:4]
    '23'
    >>> Buffer('asdf')[:10]
    'asdf'
    """

    def __init__(self, iterator, join=TokenWithPosition.join):
        """Initialization for Buffer

        :param iterator: iterator or iterable
        :param func join: function to join multiple buffer elements
        """
        assert hasattr(iterator, '__iter__'), 'Must be an iterable.'
        self.__iterator = iter(iterator)
        self.__queue = []
        self.__i = 0
        self.__join = join

    # noinspection PyPep8Naming
    def hasNext(self):
        """Returns whether or not there is another element."""
        return bool(self.peek())

    def startswith(self, s):
        """
        Check if iterator starts with s, beginning from the current position
        """
        return self.peek((0, len(s))).startswith(s)

    def endswith(self, s):
        """
        Check if iterator ends with s, ending at current position
        """
        return self.peek((-len(s), 0)).endswith(s)

    def forward(self, j=1):
        """Move forward by j steps.

        >>> b = Buffer('abcdef')
        >>> b.forward(3)
        'abc'
        >>> b.forward(-2)
        'bc'

        :param j: number of steps; negative values move backward
        :return: the joined elements that were stepped over
        """
        if j < 0:
            return self.backward(-j)
        self.__i += j
        return self[self.__i-j:self.__i]

    def num_forward_until(self, condition):
        """Count the elements before ``condition`` first holds.

        The buffer position is left unchanged.

        :param condition: callable applied to the next element (the result
            of ``peek()``); scanning stops when it returns a true value
        :return: number of elements before the condition is met
        """
        steps = 0
        while self.hasNext() and not condition(self.peek()):
            self.forward(1)
            steps += 1
        # Rewind unconditionally. This must not live inside an `assert`,
        # which is stripped under `python -O` and would leave the buffer
        # advanced.
        self.backward(steps)
        return steps

    def forward_until(self, condition):
        """Forward until one of the provided matches is found.

        The returned string contains all characters found *before* the
        condition was met. In other words, the condition will be true for
        the remainder of the buffer.

        :param condition: callable applied to the next element (the result
            of ``peek()``); scanning stops when it returns a true value
        :return: token containing the elements that were consumed
        """
        # At the end of the buffer peek() yields a plain '' without a
        # position; fall back to the current index in that case.
        start = getattr(self.peek(), 'position', self.__i)
        c = TokenWithPosition('', start)
        while self.hasNext() and not condition(self.peek()):
            c += self.forward(1)
        return c

    def backward(self, j=1):
        """Move backward by j steps.

        >>> b = Buffer('abcdef')
        >>> b.backward(-3)
        'abc'
        >>> b.backward(2)
        'bc'

        :param j: number of steps; negative values move forward
        :return: the joined elements that were stepped over
        """
        if j < 0:
            return self.forward(-j)
        assert self.__i - j >= 0, 'Cannot move more than %d backward' % self.__i
        self.__i -= j
        return self[self.__i:self.__i+j]

    def peek(self, j=(0, 1)):
        """
        Peek at the next value(s), without advancing the Buffer.

        Return None if index is out of range.

        :param j: integer offset from the current position, or a
            ``(start, stop)`` pair of offsets selecting a slice
        """
        try:
            if isinstance(j, int):
                return self[self.__i+j]
            return self[self.__i + j[0]:self.__i + j[1]]
        except IndexError:
            return None

    def __next__(self):
        """Implements next."""
        while self.__i >= len(self.__queue):
            # Stamp each element with its own index in the queue rather
            # than the cursor: the cursor may already be ahead of the
            # queue (e.g. after forward(n)), which would give several
            # elements the same, wrong position.
            self.__queue.append(TokenWithPosition(
                next(self.__iterator), len(self.__queue)))
        self.__i += 1
        return self.__queue[self.__i-1]

    def __getitem__(self, i):
        """Supports indexing list

        >>> b = Buffer('asdf')
        >>> b[5]
        Traceback (most recent call last):
            ...
        IndexError: list index out of range
        >>> b[0]
        'a'
        >>> b[1:3]
        'sd'
        >>> b[1:]
        'sdf'
        >>> b[:3]
        'asd'
        >>> b[:]
        'asdf'
        """
        if isinstance(i, int):
            old, j = self.__i, i
        else:
            old, j = self.__i, i.stop

        # Pull from the underlying iterator until the queue covers the
        # requested index/stop (or the iterator is exhausted), then
        # restore the cursor.
        while j is None or self.__i <= j:
            try:
                next(self)
            except StopIteration:
                break
        self.__i = old
        if isinstance(i, int):
            return self.__queue[i]
        return self.__join(self.__queue[i])

    def __iter__(self):
        return self

    @property
    def position(self):
        """Current index of the buffer cursor."""
        return self.__i
class CharToLineOffset(object):
    """
    Utility to convert absolute position in the source file to
    line_no:char_no_in_line. This can be very useful if we want to parse
    LaTeX and navigate to some elements in the generated DVI/PDF via
    SyncTeX.

    Both numbers are zero-based. A newline character belongs to the line
    it terminates.

    >>> clo = CharToLineOffset('''hello
    ... world
    ... I scream for ice cream!''')
    >>> clo(3)
    (0, 3)
    >>> clo(5)
    (0, 5)
    >>> clo(6)
    (1, 0)
    >>> clo(12)
    (2, 0)
    """

    def __init__(self, src):
        """Record the newline positions and total length of ``src``.

        :param src: the full source text
        """
        self.line_break_positions = [i for i, c in enumerate(src)
                                     if c == '\n']
        self.src_len = len(src)

    def __call__(self, char_pos):
        """Convert an absolute character offset to ``(line_no, char_no)``.

        :param char_pos: absolute offset into the source text
        :return: tuple of zero-based line number and column
        """
        breaks = self.line_break_positions
        # bisect_left counts only the line breaks strictly before
        # char_pos, so a newline character stays on the line it ends
        # instead of landing on the next line at column -1.
        line_no = bisect.bisect_left(breaks, char_pos)
        if line_no == 0:
            return line_no, char_pos
        line_start = breaks[line_no - 1]
        char_no = char_pos - line_start - 1
        if line_no == len(breaks):
            # Last line: clamp offsets that run past the end of the source.
            char_no = min(char_no, self.src_len - line_start)
        return line_no, char_no