# Source code for pvl.token

# -*- coding: utf-8 -*-

# Copyright 2019-2020, ``pvl`` library authors.
#
# Reuse is permitted under the terms of the license.
# The AUTHORS file and the LICENSE file are at the
# top level of this library.


from .decoder import PVLDecoder
from .grammar import PVLGrammar


class Token(str):
    """A PVL-aware string.

    A Token behaves like a ``str`` but also carries the grammar and
    decoder that are consulted by its ``is_*()`` predicates, so that
    questions such as "is this whitespace?" or "is this a number?"
    are answered according to a particular PVL dialect.

    :var content: A string that is the Token text.

    :var grammar: A pvl.grammar object, if None or not specified, it will
                  be set to the grammar parameter of *decoder* (if
                  *decoder* is not None) or will default to PVLGrammar().

    :var decoder: A pvl.decoder object, defaults to
                  PVLDecoder(grammar=*grammar*).

    :var pos: Integer that describes the starting position of this
              Token in the source string, defaults to zero.

    :raises TypeError: if *grammar* is not a PVLGrammar or *decoder*
                       is not a PVLDecoder.
    """

    def __new__(cls, content, grammar=None, decoder=None, pos=0):
        # str is immutable, so the text has to be supplied here.  The
        # other arguments are accepted only so that this signature
        # matches __init__(), which is where they are consumed.
        return str.__new__(cls, content)

    def __init__(self, content, grammar=None, decoder=None, pos=0):
        if grammar is None:
            # Fall back to the decoder's grammar (so the pair stays
            # consistent) before falling back to the default grammar.
            if decoder is not None:
                self.grammar = decoder.grammar
            else:
                self.grammar = PVLGrammar()
        elif isinstance(grammar, PVLGrammar):
            self.grammar = grammar
        else:
            raise TypeError("The grammar object is not of type PVLGrammar.")

        if decoder is None:
            self.decoder = PVLDecoder(grammar=self.grammar)
        elif isinstance(decoder, PVLDecoder):
            self.decoder = decoder
        else:
            raise TypeError("The decoder object is not of type PVLDecoder.")

        self.pos = pos

    def __repr__(self):
        return f"{self.__class__.__name__}('{self}', '{self.grammar}')"

    def __index__(self):
        """Return the Token as an integer usable as an index.

        The decoder's non-decimal conversion is tried first; if that
        fails, the text is accepted only if it is a plain integer.

        :raises ValueError: if the Token cannot be used as an index.
        """
        if self.is_decimal():
            text = str(self)
            try:
                return self.decoder.decode_non_decimal(text)
            except ValueError:
                # int() rejects text such as "3.5"; treat that the same
                # as any other non-index value so that callers always
                # see the explanatory message below.
                try:
                    as_int = int(text)
                except ValueError:
                    as_int = None

                if as_int is not None and as_int == float(text):
                    return as_int

        # Note the "!r" conversion: a ":r" format spec is not valid for
        # strings and would itself raise instead of building the message.
        raise ValueError(f"The {self!r} cannot be used as an index.")

    def __float__(self):
        return float(self.decoder.decode_decimal(str(self)))

    def _derive(self, content):
        # Build a new Token from *content* that shares this Token's
        # grammar and decoder.
        return Token(content, grammar=self.grammar, decoder=self.decoder)

    def split(self, sep=None, maxsplit=-1) -> list:
        """Extends ``str.split()`` so that calling split() on a Token
        returns a list of Tokens.
        """
        return [self._derive(part) for part in super().split(sep, maxsplit)]

    def replace(self, *args):
        """Extends ``str.replace()`` to return a Token."""
        return self._derive(super().replace(*args))

    def lstrip(self, chars=None):
        """Extends ``str.lstrip()`` to strip whitespace according
        to the definition of whitespace in the Token's grammar
        instead of the default Python whitespace definition.
        """
        return self._strip(super().lstrip, chars)

    def rstrip(self, chars=None):
        """Extends ``str.rstrip()`` to strip whitespace according
        to the definition of whitespace in the Token's grammar
        instead of the default Python whitespace definition.
        """
        return self._strip(super().rstrip, chars)

    def strip(self, chars=None):
        """Extends ``str.strip()`` to strip whitespace according
        to the definition of whitespace in the Token's grammar
        instead of the default Python whitespace definition.
        """
        return self._strip(super().strip, chars)

    def _strip(self, strip_func, chars=None):
        # Shared functionality for the various strip functions:
        # *strip_func* is the bound ``str`` method to apply, and when
        # *chars* is None the grammar's whitespace is stripped.
        if chars is None:
            chars = "".join(self.grammar.whitespace)
        return self._derive(strip_func(chars))

    def isspace(self) -> bool:
        """Overrides ``str.isspace()`` to be the same as Token's
        is_space() function, so that we don't get inconsistent
        behavior if someone forgets an underbar.
        """
        return self.is_space()

    def is_space(self) -> bool:
        """Return true if the Token contains whitespace according
        to the definition of whitespace in the Token's grammar
        and there is at least one character, false otherwise.
        """
        if len(self) == 0:
            return False

        return all(c in self.grammar.whitespace for c in self)

    def is_WSC(self) -> bool:
        """Return true if the Token is white space characters or comments
        according to the Token's grammar, false otherwise.
        """
        if self.is_comment() or self.is_space():
            return True

        # Normalise every grammar whitespace character to a plain
        # space.  The replacements must accumulate on the running
        # result, otherwise only the last character would be replaced.
        normalised = self
        for ws in reversed(self.grammar.whitespace):
            normalised = normalised.replace(ws, " ")

        return all(t.is_comment() for t in normalised.split())

    def is_comment(self) -> bool:
        """Return true if the Token is a comment according to the
        Token's grammar (defined as beginning and ending with
        comment delimiters), false otherwise.
        """
        return any(
            self.startswith(pair[0]) and self.endswith(pair[1])
            for pair in self.grammar.comments
        )

    def is_quote(self) -> bool:
        """Return true if the Token is a quote character
        according to the Token's grammar, false otherwise.
        """
        return self in self.grammar.quotes

    def is_quoted_string(self) -> bool:
        """Return true if the Token can be converted to a quoted
        string by the Token's decoder, false otherwise.
        """
        try:
            self.decoder.decode_quoted_string(self)
        except ValueError:
            return False
        return True

    def is_delimiter(self) -> bool:
        """Return true if the Token is a delimiter character
        (e.g. the ';' in PVL) according to the Token's grammar,
        false otherwise.
        """
        return self in self.grammar.delimiters

    def is_begin_aggregation(self) -> bool:
        """Return true if the Token is a begin aggregation
        keyword (e.g. 'BEGIN_GROUP' in PVL) according to
        the Token's grammar, false otherwise.

        The comparison is case-insensitive.
        """
        folded = self.casefold()
        return any(
            folded == k.casefold() for k in self.grammar.aggregation_keywords
        )

    def is_unquoted_string(self) -> bool:
        """Return false if the Token has any
        reserved characters, comment characters, whitespace
        characters or could be interpreted as a number,
        date, or time according to the Token's grammar,
        true otherwise.
        """
        if any(char in self for char in self.grammar.reserved_characters):
            return False

        if any(
            pair[0] in self or pair[1] in self
            for pair in self.grammar.comments
        ):
            return False

        if self.is_numeric() or self.is_datetime():
            return False

        return not any(char in self for char in self.grammar.whitespace)

    def is_string(self) -> bool:
        """Return true if either the Token's is_quoted_string()
        or is_unquoted_string() return true, false otherwise.
        """
        return self.is_quoted_string() or self.is_unquoted_string()

    def is_parameter_name(self) -> bool:
        """Return true if the Token is an unquoted string that
        isn't a reserved_keyword according to the Token's
        grammar, false otherwise.

        The reserved keyword comparison is case-insensitive.
        """
        folded = self.casefold()
        if any(
            word.casefold() == folded
            for word in self.grammar.reserved_keywords
        ):
            return False

        return self.is_unquoted_string()

    def is_end_statement(self) -> bool:
        """Return true if the Token matches an end statement
        from its grammar (case-insensitively), false otherwise.
        """
        folded = self.casefold()
        return any(
            e.casefold() == folded for e in self.grammar.end_statements
        )

    def isnumeric(self) -> bool:
        """Overrides ``str.isnumeric()`` to be the same as Token's
        is_numeric() function, so that we don't get inconsistent behavior
        if someone forgets an underbar.
        """
        return self.is_numeric()

    def is_numeric(self) -> bool:
        """Return true if the Token's is_decimal() or is_non_decimal()
        functions return true, false otherwise.
        """
        return self.is_decimal() or self.is_non_decimal()

    def is_decimal(self) -> bool:
        """Return true if the Token's decoder can convert the Token
        to a decimal value, false otherwise.
        """
        try:
            self.decoder.decode_decimal(self)
        except ValueError:
            return False
        return True

    def is_non_decimal(self) -> bool:
        """Return true if the Token's decoder can convert the Token
        to a numeric non-decimal value, false otherwise.
        """
        try:
            self.decoder.decode_non_decimal(self)
        except ValueError:
            return False
        return True

    # Took these out, since some grammars allow a much wider
    # range of radix values.
    #
    # def is_binary(self) -> bool:
    #     if self.grammar.binary_re.fullmatch(self) is None:
    #         return False
    #     else:
    #         return True

    # def is_octal(self) -> bool:
    #     if self.grammar.octal_re.fullmatch(self) is None:
    #         return False
    #     else:
    #         return True

    # def is_hex(self) -> bool:
    #     if self.grammar.hex_re.fullmatch(self) is None:
    #         return False
    #     else:
    #         return True

    def is_datetime(self) -> bool:
        """Return true if the Token's decoder can convert the Token
        to a datetime, false otherwise.

        Separate is_date() or is_time() functions aren't needed,
        since PVL parsing doesn't distinguish between them.
        If a user needs that distinction the decoder's
        decode_datetime(self) function should return a datetime
        time, date, or datetime object, as appropriate, and
        a user can use isinstance() to check.
        """
        try:
            self.decoder.decode_datetime(self)
        except ValueError:
            return False
        return True

    def is_simple_value(self) -> bool:
        """Return true if the Token's decoder can convert the Token
        to a 'simple value', however the decoder defines that, false
        otherwise.
        """
        try:
            self.decoder.decode_simple_value(self)
        except ValueError:
            return False
        return True