Source code for pvl.lexer

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Provides lexer functions for PVL."""

# Copyright 2019-2020, ``pvl`` library authors.
#
# Reuse is permitted under the terms of the license.
# The AUTHORS file and the LICENSE file are at the
# top level of this library.


from enum import Enum, auto

from .grammar import PVLGrammar
from .token import Token
from .decoder import PVLDecoder
from .exceptions import LexerError, firstpos


class Preserve(Enum):
    FALSE = auto()
    COMMENT = auto()
    UNIT = auto()
    QUOTE = auto()
    NONDECIMAL = auto()
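
# Illustrative sketch, not part of the original pvl source: throughout
# this module, the lexer's state travels as a plain dict pairing one of
# the Preserve members with the character (or two-character string) that
# will end that state. The helper name below is hypothetical.
def _example_preserve_dict():
    neutral = dict(state=Preserve.FALSE, end=None)
    in_comment = dict(state=Preserve.COMMENT, end="*/")
    assert neutral["state"] is Preserve.FALSE
    assert in_comment["end"] == "*/"
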
def lex_preserve(char: str, lexeme: str, preserve: dict) -> tuple((str, dict)):
    """Returns a modified *lexeme* string and a modified *preserve*
    dict in a two-tuple.

    The modified *lexeme* will always be the concatenation of
    *lexeme* and *char*.

    This is a lexer() helper function that is responsible for
    changing the state of the *preserve* dict, if needed.

    If the value for 'end' in *preserve* is the same as *char*,
    then the modified *preserve* will have its 'state' value
    set to ``Preserve.FALSE`` and its 'end' value set to None,
    otherwise the second item in the returned tuple will be
    *preserve*, unchanged.
    """
    # print(f'in preserve: char "{char}", lexeme "{lexeme}, p {preserve}"')
    if char == preserve["end"]:
        return lexeme + char, dict(state=Preserve.FALSE, end=None)
    else:
        return lexeme + char, preserve
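
# Illustrative sketch, not part of the original pvl source: while inside
# a quoted string, lex_preserve() appends every character, and seeing the
# saved 'end' character flips the state back to Preserve.FALSE. The
# helper name is hypothetical.
def _example_lex_preserve():
    preserve = dict(state=Preserve.QUOTE, end='"')
    lexeme, preserve = lex_preserve("a", '"', preserve)
    assert (lexeme, preserve["state"]) == ('"a', Preserve.QUOTE)
    lexeme, preserve = lex_preserve('"', lexeme, preserve)
    assert (lexeme, preserve["state"]) == ('"a"', Preserve.FALSE)
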
def lex_singlechar_comments(
    char: str, lexeme: str, preserve: dict, comments: dict
) -> tuple((str, dict)):
    """Returns a modified *lexeme* string and a modified *preserve*
    dict in a two-tuple.

    This is a lexer() helper function for determining how to
    modify *lexeme* and *preserve* based on the single character
    in *char* which may or may not be a comment character.

    If the *preserve* 'state' value is Preserve.COMMENT then
    the value of lex_preserve() is returned.

    If *char* is among the keys of the *comments* dict, then the
    returned *lexeme* will be the concatenation of *lexeme* and
    *char*, and the returned *preserve* dict will have its 'state'
    value set to Preserve.COMMENT and its 'end' value set to the
    value of *comments[char]*.

    Otherwise return *lexeme* and *preserve* unchanged in the
    two-tuple.
    """
    if preserve["state"] == Preserve.COMMENT:
        return lex_preserve(char, lexeme, preserve)
    elif char in comments:
        return (
            lexeme + char,
            dict(state=Preserve.COMMENT, end=comments[char]),
        )

    return lexeme, preserve
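
# Illustrative sketch, not part of the original pvl source: a
# hypothetical *comments* dict mapping a "#" comment start to a newline
# terminator, in the same {start: end} shape that
# _prepare_comment_tuples() builds below. The helper name is
# hypothetical.
def _example_lex_singlechar_comments():
    lexeme, preserve = lex_singlechar_comments(
        "#", "", dict(state=Preserve.FALSE, end=None), {"#": "\n"}
    )
    assert lexeme == "#"
    assert preserve == dict(state=Preserve.COMMENT, end="\n")
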
def lex_multichar_comments(
    char: str,
    prev_char: str,
    next_char: str,
    lexeme: str,
    preserve: dict,
    comments: tuple(tuple((str, str))) = PVLGrammar().comments,
) -> tuple((str, dict)):
    """Returns a modified *lexeme* string and a modified *preserve*
    dict in a two-tuple.

    This is a lexer() helper function for determining how to
    modify *lexeme* and *preserve* based on the single character
    in *char* which may or may not be part of a multi-character
    comment character group.

    This function has an internal list of allowed pairs of
    multi-character comments that it can deal with; if the
    *comments* tuple contains any two-tuples that cannot be
    handled, a NotImplementedError will be raised.

    This function will determine whether to append *char* to
    *lexeme* or not, and will set the value of the 'state' and
    'end' values of *preserve* appropriately.
    """
    # print(f'lex_multichar got these comments: {comments}')
    if len(comments) == 0:
        raise ValueError("The variable provided to comments is empty.")

    allowed_pairs = (("/*", "*/"),)
    for p in comments:
        if p not in allowed_pairs:
            raise NotImplementedError(
                "Can only handle these "
                "multicharacter comments: "
                f"{allowed_pairs}. To handle "
                "others this class must be extended."
            )

    if ("/*", "*/") in comments:
        if char == "*":
            if prev_char == "/":
                return lexeme + "/*", dict(state=Preserve.COMMENT, end="*/")
            elif next_char == "/":
                return lexeme + "*/", dict(state=Preserve.FALSE, end=None)
            else:
                return lexeme + "*", preserve
        elif char == "/":
            # If part of a comment ignore, and let the char == '*' handler
            # above deal with it, otherwise add it to the lexeme.
            if prev_char != "*" and next_char != "*":
                return lexeme + "/", preserve

    return lexeme, preserve
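
# Illustrative sketch, not part of the original pvl source: walking "/*"
# through lex_multichar_comments() character by character. The "/" is
# deferred to the "*" branch, which then appends both characters at once.
# The helper name is hypothetical.
def _example_lex_multichar_comments():
    preserve = dict(state=Preserve.FALSE, end=None)
    lexeme, preserve = lex_multichar_comments("/", None, "*", "", preserve)
    assert lexeme == ""  # the "/" is deferred
    lexeme, preserve = lex_multichar_comments("*", "/", "c", lexeme, preserve)
    assert (lexeme, preserve["end"]) == ("/*", "*/")
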
def lex_comment(
    char: str,
    prev_char: str,
    next_char: str,
    lexeme: str,
    preserve: dict,
    c_info: dict,
) -> tuple((str, dict)):
    """Returns a modified *lexeme* string and a modified *preserve*
    dict in a two-tuple.

    This is a lexer() helper function for determining how to
    modify *lexeme* and *preserve* based on the single character
    in *char* which may or may not be a comment character.

    This function just makes the decision about whether to call
    lex_multichar_comments() or lex_singlechar_comments(), and
    then returns what they return.
    """
    if char in c_info["multi_chars"]:
        return lex_multichar_comments(
            char,
            prev_char,
            next_char,
            lexeme,
            preserve,
            comments=c_info["multi_comments"],
        )
    else:
        return lex_singlechar_comments(
            char, lexeme, preserve, c_info["single_comments"]
        )
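
# Illustrative sketch, not part of the original pvl source: lex_comment()
# dispatching a multi-character comment opener via the c_info dict that
# _prepare_comment_tuples() (below) assembles. The helper name is
# hypothetical.
def _example_lex_comment():
    c_info = _prepare_comment_tuples((("/*", "*/"),))
    preserve = dict(state=Preserve.FALSE, end=None)
    lexeme, preserve = lex_comment("*", "/", "c", "", preserve, c_info)
    assert (lexeme, preserve["state"]) == ("/*", Preserve.COMMENT)
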
def _prev_char(s: str, idx: int):
    """Returns the character from *s* at the position before *idx*
    or None, if *idx* is zero.
    """
    if idx <= 0:
        return None
    else:
        return s[idx - 1]


def _next_char(s: str, idx: int):
    """Returns the character from *s* at the position after *idx*
    or None, if *idx* is the last position in *s*.
    """
    try:
        return s[idx + 1]
    except IndexError:
        return None


def _prepare_comment_tuples(comments: tuple(tuple((str, str)))) -> dict:
    """Returns a dict of information based on the contents
    of *comments*.

    This is a lexer() helper function to prepare information
    for lexer().
    """
    # I initially tried to avoid this function, but if you
    # don't pre-compute this stuff, you end up re-computing
    # it every time you pass into the lex_comment() function,
    # which seemed excessive.
    d = dict()
    m = list()
    d["single_comments"] = dict()
    d["multi_chars"] = set()
    for pair in comments:
        if len(pair[0]) == 1:
            d["single_comments"][pair[0]] = pair[1]
        else:
            m.append(pair)
            for p in pair:
                d["multi_chars"] |= set(p)

    d["chars"] = set(d["single_comments"].keys())
    d["chars"] |= d["multi_chars"]
    d["multi_comments"] = tuple(m)

    # print(d)
    return d
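
# Illustrative sketch, not part of the original pvl source: the c_info
# dict produced for the default PVL comment pair, with "chars" holding
# every character that can begin or end a comment. The helper name is
# hypothetical.
def _example_prepare_comment_tuples():
    c_info = _prepare_comment_tuples((("/*", "*/"),))
    assert c_info["single_comments"] == {}
    assert c_info["multi_chars"] == {"/", "*"}
    assert c_info["chars"] == {"/", "*"}
    assert c_info["multi_comments"] == (("/*", "*/"),)
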
def lex_char(
    char: str,
    prev_char: str,
    next_char: str,
    lexeme: str,
    preserve: dict,
    g: PVLGrammar,
    c_info: dict,
) -> tuple((str, dict)):
    """Returns a modified *lexeme* string and a modified *preserve*
    dict in a two-tuple.

    This is the main lexer() helper function for determining how
    to modify (or not) *lexeme* and *preserve* based on the
    single character in *char* and the other values passed into
    this function.
    """
    # When we are 'in' a comment or a units expression,
    # we want those to consume everything, regardless.
    # So we must handle the 'preserve' states first,
    # and then after that we can check to see if the char
    # should put us into one of those states.

    # print(f'lex_char start: char "{char}", lexeme "{lexeme}", "{preserve}"')

    if preserve["state"] != Preserve.FALSE:
        if preserve["state"] == Preserve.COMMENT:
            (lexeme, preserve) = lex_comment(
                char, prev_char, next_char, lexeme, preserve, c_info
            )
        elif preserve["state"] in (
            Preserve.UNIT,
            Preserve.QUOTE,
            Preserve.NONDECIMAL,
        ):
            (lexeme, preserve) = lex_preserve(char, lexeme, preserve)
        else:
            raise ValueError(
                "{} is not a ".format(preserve["state"])
                + "recognized preservation state."
            )
    elif (
        char == "#"
        and g.nondecimal_pre_re.fullmatch(lexeme + char) is not None
    ):
        lexeme += char
        preserve = dict(state=Preserve.NONDECIMAL, end="#")
    elif char in c_info["chars"]:
        (lexeme, preserve) = lex_comment(
            char, prev_char, next_char, lexeme, preserve, c_info
        )
    elif char in g.units_delimiters[0]:
        lexeme += char
        preserve = dict(state=Preserve.UNIT, end=g.units_delimiters[1])
    elif char in g.quotes:
        lexeme += char
        preserve = dict(state=Preserve.QUOTE, end=char)
    else:
        if char not in g.whitespace:
            lexeme += char  # adding a char each time

    # print(f'lex_char end: char "{char}", lexeme "{lexeme}", "{preserve}"')
    return lexeme, preserve
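
# Illustrative sketch, not part of the original pvl source: with a
# default PVLGrammar (assuming '"' is among its quote characters), a
# quote flips lex_char() into Preserve.QUOTE, after which characters are
# consumed verbatim. The helper name is hypothetical.
def _example_lex_char():
    g = PVLGrammar()
    c_info = _prepare_comment_tuples(g.comments)
    preserve = dict(state=Preserve.FALSE, end=None)
    lexeme, preserve = lex_char('"', None, "a", "", preserve, g, c_info)
    assert preserve == dict(state=Preserve.QUOTE, end='"')
    lexeme, preserve = lex_char("a", '"', '"', lexeme, preserve, g, c_info)
    assert lexeme == '"a'
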
def lex_continue(
    char: str,
    next_char: str,
    lexeme: str,
    token: Token,
    preserve: dict,
    g: PVLGrammar,
) -> bool:
    """Return True if accumulation of *lexeme* should continue based
    on the values passed into this function, False otherwise.

    This is a lexer() helper function.
    """
    if next_char is None:
        return False

    if not g.char_allowed(next_char):
        return False

    if preserve["state"] != Preserve.FALSE:
        return True

    # Since Numeric objects can begin with a reserved
    # character, the reserved characters may split up
    # the lexeme.
    if (
        char in g.numeric_start_chars
        and Token(char + next_char, grammar=g).is_numeric()
    ):
        return True

    # Since non-decimal numerics can have reserved characters in them.
    if g.nondecimal_pre_re.fullmatch(lexeme + next_char) is not None:
        return True

    # Since the numeric signs could be in the reserved characters,
    # make sure we can parse scientific notation correctly:
    if (
        char.lower() == "e"
        and next_char in g.numeric_start_chars
        and Token(lexeme + next_char + "2", grammar=g).is_numeric()
    ):
        return True

    # Some datetimes can have trailing numeric tz offsets;
    # if the decoder allows it, this means there could be
    # a '+' that splits the lexeme, which we don't want.
    if next_char in g.numeric_start_chars and token.is_datetime():
        return True

    return False
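
# Illustrative sketch, not part of the original pvl source: assuming "-"
# is one of the grammar's numeric start characters, a lexeme that looks
# like the start of a number keeps accumulating instead of being split
# at the reserved character. The helper name is hypothetical.
def _example_lex_continue():
    g = PVLGrammar()
    preserve = dict(state=Preserve.FALSE, end=None)
    tok = Token("-", grammar=g)
    assert lex_continue("-", "5", "-", tok, preserve, g)
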
def lexer(s: str, g=PVLGrammar(), d=PVLDecoder()):
    """This is a generator function that returns pvl.Token objects
    based on the passed-in string, *s*, when the generator's
    next() is called.

    A call to send(*t*) will 'return' the value *t* to the
    generator, which will be yielded upon calling next().
    This allows a user to 'peek' at the next token, but return it
    if they don't like what they see.

    *g* is expected to be an instance of pvl.grammar, and *d* an
    instance of pvl.decoder. The lexer will perform differently,
    given different values of *g* and *d*.
    """
    c_info = _prepare_comment_tuples(g.comments)
    # print(c_info)

    lexeme = ""
    preserve = dict(state=Preserve.FALSE, end=None)
    for i, char in enumerate(s):
        if not g.char_allowed(char):
            raise LexerError(
                f'The character "{char}" (ord: {ord(char)}) '
                "is not allowed by the grammar.",
                s,
                i,
                lexeme,
            )

        prev_char = _prev_char(s, i)
        next_char = _next_char(s, i)

        # print(repr(f'lexeme at top: ->{lexeme}<-, char: {char}, '
        #            f'prev: {prev_char}, next: {next_char}, '
        #            f'{preserve}'))

        (lexeme, preserve) = lex_char(
            char, prev_char, next_char, lexeme, preserve, g, c_info
        )

        # print(repr(f'       at bot: ->{lexeme}<-, '
        #            f'{preserve}'))

        # Now having dealt with char, decide whether to
        # continue accumulating the lexeme, or yield it.

        if lexeme == "":
            continue

        try:
            # The ``while t is not None: yield None; t = yield t``
            # construction below allows a user of the lexer to
            # yield a token, not like what they see, and then use
            # the generator's send() function to put the token
            # back into the generator.
            #
            # The first ``yield None`` in there allows the call to
            # send() on this generator to return None, and keep the
            # value of *t* ready for the next call of next() on the
            # generator. This is the magic that allows a user to
            # 'return' a token to the generator.
            tok = Token(lexeme, grammar=g, decoder=d, pos=firstpos(lexeme, i))
            if lex_continue(char, next_char, lexeme, tok, preserve, g):
                # Any lexeme state that we want to just allow
                # to run around again, and don't want to get
                # caught by the clause in the elif, should
                # test true via lex_continue().
                continue

            elif (
                next_char is None
                or not g.char_allowed(next_char)
                or next_char in g.whitespace
                or next_char in g.reserved_characters
                or s.startswith(tuple(p[0] for p in g.comments), i + 1)
                or lexeme.endswith(tuple(p[1] for p in g.comments))
                or lexeme in g.reserved_characters
                or tok.is_quoted_string()
            ):
                # print(f'yielding {tok}')
                t = yield tok
                while t is not None:
                    yield None
                    t = yield t
                lexeme = ""
            else:
                continue

        except ValueError as err:
            raise LexerError(err, s, i, lexeme)
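
# Illustrative usage sketch, not part of the original pvl source: pulling
# tokens with next() and "peeking" by pushing one back with send(), as
# described in the docstring above. The helper name is hypothetical.
def _example_lexer():
    tokens = lexer("a = b")
    t = next(tokens)
    assert t == "a"
    assert tokens.send(t) is None  # push the token back ...
    assert next(tokens) == "a"  # ... and it comes out again
    assert next(tokens) == "="
    assert next(tokens) == "b"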