# Source code for TexSoup.utils

import bisect
import functools


##############
# Decorators #
##############


def to_buffer(f):
    """
    Decorator converting the ``iterator`` argument of ``f`` into a Buffer.

    The value is taken from the ``iterator`` keyword argument if present,
    otherwise from the first positional argument. It is wrapped in a
    :class:`Buffer` unless it already is one; every other argument is
    forwarded unchanged.

    :param f: function to decorate; when the value is passed by keyword the
        parameter is assumed to be named ``iterator``
    :return: the wrapping function (metadata copied from ``f``)
    """
    @functools.wraps(f)
    def wrap(*args, **kwargs):
        if 'iterator' in kwargs:
            # Passed by keyword: convert in place and forward by keyword so
            # the value is not duplicated as a positional argument.
            if not isinstance(kwargs['iterator'], Buffer):
                kwargs['iterator'] = Buffer(kwargs['iterator'])
            return f(*args, **kwargs)
        if not args:
            # Nothing to convert; let ``f`` report its own missing argument.
            return f(*args, **kwargs)
        iterator = args[0]
        if not isinstance(iterator, Buffer):
            iterator = Buffer(iterator)
        return f(iterator, *args[1:], **kwargs)
    return wrap

#########################
# Generalized Utilities #
#########################


class TokenWithPosition(str):
    """Enhanced string object with knowledge of global position.

    Compares, hashes and tests containment like the wrapped string, and
    additionally carries ``position``: the offset of its first character in
    the original buffer. Operations that build new tokens (slicing,
    iteration, concatenation, splitting, stripping) return tokens with an
    adjusted position. A ``position`` of ``None`` means "unknown" and is
    propagated as ``None`` instead of raising.
    """

    # noinspection PyArgumentList
    def __new__(cls, text, position=None):
        """Initializer for pseudo-string object.

        If ``text`` is already a TokenWithPosition, its text and position
        are copied and the ``position`` argument is ignored.

        :param text: The original string
        :param position: Position in the original buffer
        """
        self = str.__new__(cls, text)
        if isinstance(text, TokenWithPosition):
            self.text, self.position = text.text, text.position
        else:
            self.text = text
            self.position = position
        return self

    def _shifted(self, delta):
        """Return ``position + delta``, or ``None`` if position is unknown."""
        if self.position is None:
            return None
        return self.position + delta

    def __repr__(self):
        return repr(self.text)

    def __str__(self):
        return str(self.text)

    def __getattr__(self, name):
        """Delegate unknown attributes to the wrapped text."""
        if name == 'text':
            # ``text`` itself is missing; looking it up below would recurse
            # back into this method forever.
            raise AttributeError(name)
        return getattr(self.text, name)

    def __eq__(self, other):
        """Compare by text only; positions are ignored.

        >>> TokenWithPosition('asdf', 0) == TokenWithPosition('asdf', 2)
        True
        >>> TokenWithPosition('asdf', 0) == TokenWithPosition('asd', 0)
        False
        """
        if isinstance(other, TokenWithPosition):
            return self.text == other.text
        return self.text == other

    def __hash__(self):
        # Must agree with __eq__, which only looks at the text.
        return hash(self.text)

    def __add__(self, other):
        """Implements addition in the form of TokenWithPosition(...) + (obj).

        The result keeps the position of the left operand.

        >>> t1 = TokenWithPosition('as', 0) + TokenWithPosition('df', 1)
        >>> str(t1)
        'asdf'
        >>> t1.position
        0
        >>> t2 = TokenWithPosition('as', 1) + 'df'
        >>> str(t2)
        'asdf'
        >>> t3 = TokenWithPosition(t2)
        >>> t3.position
        1
        """
        suffix = other.text if isinstance(other, TokenWithPosition) else other
        return TokenWithPosition(self.text + suffix, self.position)

    def __radd__(self, other):
        """Implements addition in the form of (obj) + TokenWithPosition(...).

        Note that if the first element is TokenWithPosition,
        TokenWithPosition(...).__add__(...) will be used. As a result, we
        can assume WLOG that `other` is a type other than TokenWithPosition.
        The position is moved back by the length of the prepended text.

        >>> t1 = TokenWithPosition('as', 2) + TokenWithPosition('dfg', 2)
        >>> str(t1)
        'asdfg'
        >>> t1.position
        2
        >>> t2 = 'as' + TokenWithPosition('dfg', 2)
        >>> str(t2)
        'asdfg'
        >>> t2.position
        0
        """
        return TokenWithPosition(other + self.text,
                                 self._shifted(-len(other)))

    def __iadd__(self, other):
        """Implements addition in the form of TokenWithPosition(...) += ...

        Strings are immutable, so this returns a new token exactly as
        ``__add__`` does.

        >>> t1 = TokenWithPosition('as', 0)
        >>> t1 += 'df'
        >>> str(t1)
        'asdf'
        >>> t1.position
        0
        """
        return self.__add__(other)

    @classmethod
    def join(cls, tokens, glue=''):
        """Join a sequence of tokens into one token.

        :param tokens: sequence of objects exposing ``text`` and ``position``
        :param glue: string inserted between consecutive texts
        :return: token positioned at the first element, or the plain empty
            string ``''`` when ``tokens`` is empty
        """
        if not tokens:
            return ''
        return TokenWithPosition(glue.join(t.text for t in tokens),
                                 tokens[0].position)

    def __bool__(self):
        return bool(self.text)

    def __contains__(self, item):
        """
        >>> 'rg' in TokenWithPosition('corgi', 0)
        True
        >>> 'reg' in TokenWithPosition('corgi', 0)
        False
        >>> TokenWithPosition('rg', 0) in TokenWithPosition('corgi', 0)
        True
        """
        if isinstance(item, TokenWithPosition):
            return item.text in self.text
        return item in self.text

    def __iter__(self):
        """Yield each character as a token carrying its own position.

        >>> list(TokenWithPosition('asdf', 0))
        ['a', 's', 'd', 'f']
        """
        for i, c in enumerate(self.text):
            yield TokenWithPosition(c, self._shifted(i))

    def __getitem__(self, i):
        """Access characters in object just as with strings.

        The result's position is that of the first selected character.

        >>> t1 = TokenWithPosition('asdf', 2)
        >>> t1[0]
        'a'
        >>> t1[-1]
        'f'
        >>> t1[:]
        'asdf'
        >>> t1[-10:].position
        2
        """
        # Index first so invalid indices raise the usual str errors.
        piece = self.text[i]
        if isinstance(i, slice):
            # ``indices`` resolves None / negative / out-of-range starts the
            # same way string slicing does.
            start = i.indices(len(self.text))[0]
        else:
            start = i
            if start < 0:
                start += len(self.text)
        return TokenWithPosition(piece, self._shifted(start))

    def split(self, sep=None, maxsplit=-1):
        """Split like ``str.split``, returning positioned tokens.

        >>> parts = TokenWithPosition('a a', 5).split()
        >>> [p.position for p in parts]
        [5, 7]

        :param sep: delimiter; ``None`` splits on runs of whitespace
        :param maxsplit: maximum number of splits, ``-1`` for no limit
        :return: list of TokenWithPosition
        """
        result = []
        cursor = 0  # offset in self.text just past the previous piece
        for piece in self.text.split(sep=sep, maxsplit=maxsplit):
            if sep is None:
                # Only whitespace lies between pieces, so the first match
                # at or after the cursor is the piece itself.
                offset = self.text.find(piece, cursor)
                cursor = offset + len(piece)
            else:
                # Pieces are separated by exactly one ``sep``.
                offset = cursor
                cursor = offset + len(piece) + len(sep)
            result.append(TokenWithPosition(piece, self._shifted(offset)))
        return result

    def strip(self, *args, **kwargs):
        """Strip both ends; position moves to the first kept character.

        >>> TokenWithPosition('  asdf  ', 2).strip().position
        4
        """
        stripped = self.text.strip(*args, **kwargs)
        offset = self.text.find(stripped)
        return TokenWithPosition(stripped, self._shifted(offset))

    def lstrip(self, *args, **kwargs):
        """Strip leading whitespace for text.

        >>> t = TokenWithPosition('  asdf  ', 2)
        >>> t.lstrip()
        'asdf  '
        """
        stripped = self.text.lstrip(*args, **kwargs)
        offset = self.text.find(stripped)
        return TokenWithPosition(stripped, self._shifted(offset))

    def rstrip(self, *args, **kwargs):
        """Strip trailing whitespace for text.

        >>> t = TokenWithPosition('  asdf  ', 2)
        >>> t.rstrip()
        '  asdf'
        """
        stripped = self.text.rstrip(*args, **kwargs)
        # The result is a prefix of the text, so the position is unchanged.
        return TokenWithPosition(stripped, self.position)


# General Buffer class
class Buffer:
    """Converts string or iterable into a navigable iterator of strings

    Elements pulled from the underlying iterator are cached, so the cursor
    can move backward as well as forward and arbitrary ranges can be read
    without losing data.

    >>> b1 = Buffer("012345")
    >>> next(b1)
    '0'
    >>> b1.forward()
    '1'
    >>> b1.endswith('1')
    True
    >>> b1.backward(2)
    '01'
    >>> b1.peek()
    '0'
    >>> b1.peek(2)
    '2'
    >>> b1.peek((0, 2))
    '01'
    >>> b1.startswith('01')
    True
    >>> b1[2:4]
    '23'
    >>> Buffer('asdf')[:10]
    'asdf'
    """

    def __init__(self, iterator, join=TokenWithPosition.join):
        """Initialization for Buffer

        :param iterator: iterator or iterable
        :param func join: function to join multiple buffer elements
        """
        assert hasattr(iterator, '__iter__'), 'Must be an iterable.'
        self.__iterator = iter(iterator)
        self.__queue = []  # every element read so far, in order
        self.__i = 0       # cursor: index of the next element to return
        self.__join = join

    # noinspection PyPep8Naming
    def hasNext(self):
        """Returns whether or not there is another element."""
        return bool(self.peek())

    def startswith(self, s):
        """
        Check if iterator starts with s, beginning from the current position
        """
        return self.peek((0, len(s))).startswith(s)

    def endswith(self, s):
        """
        Check if iterator ends with s, ending at current position
        """
        return self.peek((-len(s), 0)).endswith(s)

    def forward(self, j=1):
        """Move forward by j steps.

        >>> b = Buffer('abcdef')
        >>> b.forward(3)
        'abc'
        >>> b.forward(-2)
        'bc'

        :param int j: number of steps; negative values move backward
        :return: the joined elements that were stepped over
        """
        if j < 0:
            return self.backward(-j)
        self.__i += j
        return self[self.__i-j:self.__i]

    def num_forward_until(self, condition):
        """Count elements before the condition is met, without advancing.

        The buffer is moved forward while counting and then rewound to
        where it started.

        :param condition: predicate called with the next element; scanning
            stops as soon as it returns a true value
        :return: number of elements before the condition holds
        """
        i, c = 0, ''
        while self.hasNext() and not condition(self.peek()):
            c += self.forward(1)
            i += 1
        # Rewind outside the assert: under ``python -O`` assert statements
        # are removed, which would otherwise leave the cursor advanced.
        rewound = self.backward(i)
        assert rewound == c
        return i

    def forward_until(self, condition):
        """Forward until the condition is met.

        The returned string contains all characters found *before* the
        condition was met; the buffer is left positioned at the first
        element for which the condition holds (or at the end).

        :param condition: predicate called with the next element; scanning
            stops as soon as it returns a true value
        :return: the elements that were skipped, joined together
        """
        # At the end of the buffer ``peek`` may return a plain string with
        # no ``position`` attribute; fall back to an unknown position.
        start = getattr(self.peek(), 'position', None)
        c = TokenWithPosition('', start)
        while self.hasNext() and not condition(self.peek()):
            c += self.forward(1)
        return c

    def backward(self, j=1):
        """Move backward by j steps.

        >>> b = Buffer('abcdef')
        >>> b.backward(-3)
        'abc'
        >>> b.backward(2)
        'bc'

        :param int j: number of steps; negative values move forward
        :return: the joined elements that were stepped over
        """
        if j < 0:
            return self.forward(-j)
        assert self.__i - j >= 0, 'Cannot move more than %d backward' % self.__i
        self.__i -= j
        return self[self.__i:self.__i+j]

    def peek(self, j=(0, 1)):
        """
        Peek at the next value(s), without advancing the Buffer.
        Return None if index is out of range.

        :param j: an integer offset from the current position, or a
            ``(start, stop)`` pair of offsets selecting a range
        """
        try:
            if isinstance(j, int):
                return self[self.__i+j]
            return self[self.__i + j[0]:self.__i + j[1]]
        except IndexError:
            return None

    def __next__(self):
        """Implements next."""
        while self.__i >= len(self.__queue):
            # Tag each new element with its own index in the queue. The
            # cursor may already be several elements ahead of the queue,
            # so using the cursor would give them all the same position.
            self.__queue.append(TokenWithPosition(next(self.__iterator),
                                                  len(self.__queue)))
        self.__i += 1
        return self.__queue[self.__i-1]

    def __getitem__(self, i):
        """Supports indexing list

        Reads ahead from the underlying iterator as far as needed and then
        restores the cursor, so indexing never moves the buffer.

        >>> b = Buffer('asdf')
        >>> b[5]
        Traceback (most recent call last):
            ...
        IndexError: list index out of range
        >>> b[0]
        'a'
        >>> b[1:3]
        'sd'
        >>> b[1:]
        'sdf'
        >>> b[:3]
        'asd'
        >>> b[:]
        'asdf'
        """
        old = self.__i
        j = i if isinstance(i, int) else i.stop

        # An open-ended slice (stop is None) drains the whole iterator.
        while j is None or self.__i <= j:
            try:
                next(self)
            except StopIteration:
                break
        self.__i = old
        if isinstance(i, int):
            return self.__queue[i]
        return self.__join(self.__queue[i])

    def __iter__(self):
        return self

    @property
    def position(self):
        """Current cursor index within the buffer."""
        return self.__i


class CharToLineOffset(object):
    """
    Utility to convert absolute position in the source file
    to line_no:char_no_in_line.
    This can be very useful if we want to parse LaTeX and
    navigate to some elements in the generated DVI/PDF via SyncTeX.

    Both numbers are zero-based. A newline character belongs to the line
    it terminates.

    >>> clo = CharToLineOffset('''hello
    ... world
    ... I scream for ice cream!''')
    >>> clo(3)
    (0, 3)
    >>> clo(5)
    (0, 5)
    >>> clo(6)
    (1, 0)
    >>> clo(12)
    (2, 0)
    """
    def __init__(self, src):
        """Index the newline offsets of ``src``.

        :param str src: full source text
        """
        self.line_break_positions = [i for i, c in enumerate(src) if c == '\n']
        self.src_len = len(src)

    def __call__(self, char_pos):
        """Translate an absolute offset into ``(line_no, char_no)``.

        :param int char_pos: offset into the source text
        :return: tuple of zero-based line number and column
        """
        # bisect_left counts only the line breaks strictly before char_pos,
        # so the offset of a newline itself stays on the line it ends
        # (bisect_right would report it as column -1 of the next line).
        line_no = bisect.bisect_left(self.line_break_positions, char_pos)
        if line_no == 0:
            return line_no, char_pos
        line_start = self.line_break_positions[line_no - 1]
        char_no = char_pos - line_start - 1
        if line_no == len(self.line_break_positions):
            # Last line: cap the column for offsets past the end of source.
            char_no = min(char_no, self.src_len - line_start)
        return line_no, char_no