#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Provides lexer functions for PVL."""
# Copyright 2019-2020, ``pvl`` library authors.
#
# Reuse is permitted under the terms of the license.
# The AUTHORS file and the LICENSE file are at the
# top level of this library.
from enum import Enum, auto
from .grammar import PVLGrammar
from .token import Token
from .decoder import PVLDecoder
from .exceptions import LexerError, firstpos
class Preserve(Enum):
    """States of the lexer's character-preservation mechanism.

    While the lexer is inside one of the non-FALSE states, lexeme
    accumulation must consume every character until the matching
    closing delimiter is seen, regardless of what those characters
    would otherwise mean to the grammar.
    """

    FALSE = auto()       # not preserving; normal lexing rules apply
    COMMENT = auto()     # inside a comment
    UNIT = auto()        # inside a units expression
    QUOTE = auto()       # inside a quoted string
    NONDECIMAL = auto()  # inside a non-decimal numeric literal
def lex_preserve(char: str, lexeme: str, preserve: dict) -> tuple:
    """Returns a modified *lexeme* string and a modified *preserve*
    dict in a two-tuple.

    The modified *lexeme* will always be the concatenation of
    *lexeme* and *char*.

    This is a lexer() helper function that is responsible for
    changing the state of the *preserve* dict, if needed.

    If the value for 'end' in *preserve* is the same as *char*,
    then the modified *preserve* will have its 'state' value
    set to ``Preserve.FALSE`` and its 'end' value set to None,
    otherwise the second item in the returned tuple will be
    *preserve* unchanged.
    """
    if char == preserve["end"]:
        # Closing delimiter reached: leave the preservation state.
        return lexeme + char, dict(state=Preserve.FALSE, end=None)

    return lexeme + char, preserve
def _prev_char(s: str, idx: int):
"""Returns the character from *s* at the position before *idx*
or None, if *idx* is zero.
"""
if idx <= 0:
return None
else:
return s[idx - 1]
def _next_char(s: str, idx: int):
"""Returns the character from *s* at the position after *idx*
or None, if *idx* is the last position in *s*.
"""
try:
return s[idx + 1]
except IndexError:
return None
def _prepare_comment_tuples(comments: tuple(tuple((str, str)))) -> dict:
"""Returns a dict of information based on the contents
of *comments*.
This is a lexer() helper function to prepare information
for lexer().
"""
# I initially tried to avoid this function, if you
# don't pre-compute this stuff, you end up re-computing
# it every time you pass into the lex_comment() function,
# which seemed excessive.
d = dict()
m = list()
d["single_comments"] = dict()
d["multi_chars"] = set()
for pair in comments:
if len(pair[0]) == 1:
d["single_comments"][pair[0]] = pair[1]
else:
m.append(pair)
for p in pair:
d["multi_chars"] |= set(p)
d["chars"] = set(d["single_comments"].keys())
d["chars"] |= d["multi_chars"]
d["multi_comments"] = tuple(m)
# print(d)
return d
def lex_char(
    char: str,
    prev_char: str,
    next_char: str,
    lexeme: str,
    preserve: dict,
    g: PVLGrammar,
    c_info: dict,
) -> tuple:
    """Returns a modified *lexeme* string and a modified *preserve*
    dict in a two-tuple.

    This is the main lexer() helper function for determining how
    to modify (or not) *lexeme* and *preserve* based on the
    single character in *char* and the other values passed into
    this function.
    """
    # When we are 'in' a comment, a units expression, a quoted
    # string, or a non-decimal number, that state must consume
    # everything, regardless.  So the 'preserve' states are handled
    # first, and only afterwards do we check whether *char* should
    # put us into one of those states.
    if preserve["state"] != Preserve.FALSE:
        if preserve["state"] == Preserve.COMMENT:
            (lexeme, preserve) = lex_comment(
                char, prev_char, next_char, lexeme, preserve, c_info
            )
        elif preserve["state"] in (
            Preserve.UNIT,
            Preserve.QUOTE,
            Preserve.NONDECIMAL,
        ):
            (lexeme, preserve) = lex_preserve(char, lexeme, preserve)
        else:
            raise ValueError(
                "{} is not a ".format(preserve["state"])
                + "recognized preservation state."
            )
    elif (
        char == "#"
        and g.nondecimal_pre_re.fullmatch(lexeme + char) is not None
    ):
        # A '#' only begins non-decimal preservation when the text
        # accumulated so far matches the grammar's non-decimal
        # prefix pattern; the number is ended by the next '#'.
        lexeme += char
        preserve = dict(state=Preserve.NONDECIMAL, end="#")
    elif char in c_info["chars"]:
        # *char* could start (or be) a comment delimiter.
        (lexeme, preserve) = lex_comment(
            char, prev_char, next_char, lexeme, preserve, c_info
        )
    elif char in g.units_delimiters[0]:
        lexeme += char
        preserve = dict(state=Preserve.UNIT, end=g.units_delimiters[1])
    elif char in g.quotes:
        # A quote closes only on the same quote character.
        lexeme += char
        preserve = dict(state=Preserve.QUOTE, end=char)
    else:
        # Outside of any preserved state, whitespace separates
        # lexemes and is dropped; everything else accumulates.
        if char not in g.whitespace:
            lexeme += char

    return lexeme, preserve
def lex_continue(
    char: str,
    next_char: str,
    lexeme: str,
    token: Token,
    preserve: dict,
    g: PVLGrammar,
) -> bool:
    """Return True if accumulation of *lexeme* should continue based
    on the values passed into this function, False otherwise.

    This is a lexer() helper function.
    """
    if next_char is None:
        return False

    if not g.char_allowed(next_char):
        return False

    # While preserving (comment, units, quote, non-decimal),
    # always keep accumulating.
    if preserve["state"] != Preserve.FALSE:
        return True

    # Since Numeric objects can begin with a reserved
    # character, the reserved characters may split up
    # the lexeme.
    if (
        char in g.numeric_start_chars
        and Token(char + next_char, grammar=g).is_numeric()
    ):
        return True

    # Since Non Decimal Numerics can have reserved characters in them.
    if g.nondecimal_pre_re.fullmatch(lexeme + next_char) is not None:
        return True

    # Since the numeric signs could be in the reserved characters,
    # make sure we can parse scientific notation correctly.  The
    # literal "2" stands in for exponent digits not yet read, so
    # that e.g. "1e" + "+" can be probed as the numeric "1e+2".
    if (
        char.lower() == "e"
        and next_char in g.numeric_start_chars
        and Token(lexeme + next_char + "2", grammar=g).is_numeric()
    ):
        return True

    # Some datetimes can have trailing numeric tz offsets;
    # if the decoder allows it, this means there could be
    # a '+' that splits the lexeme that we don't want.
    if next_char in g.numeric_start_chars and token.is_datetime():
        return True

    return False
def lexer(s: str, g=None, d=None):
    """This is a generator function that returns pvl.Token objects
    based on the passed in string, *s*, when the generator's
    next() is called.

    A call to send(*t*) will 'return' the value *t* to the
    generator, which will be yielded upon calling next().
    This allows a user to 'peek' at the next token, but return it
    if they don't like what they see.

    *g* is expected to be an instance of pvl.grammar, and *d* an
    instance of pvl.decoder; when not given, fresh ``PVLGrammar()``
    and ``PVLDecoder()`` instances are used.  The lexer will
    perform differently, given different values of *g* and *d*.

    Raises LexerError when *s* contains a character that the
    grammar does not allow, or when a lexeme cannot be made into
    a Token.
    """
    # Build the defaults per call: instances in the ``def`` line
    # would be created once at import time and shared (and possibly
    # mutated) across every invocation of lexer().
    if g is None:
        g = PVLGrammar()
    if d is None:
        d = PVLDecoder()

    c_info = _prepare_comment_tuples(g.comments)

    lexeme = ""
    preserve = dict(state=Preserve.FALSE, end=None)
    for i, char in enumerate(s):
        if not g.char_allowed(char):
            raise LexerError(
                f'The character "{char}" (ord: {ord(char)}) '
                "is not allowed by the grammar.",
                s,
                i,
                lexeme,
            )

        prev_char = _prev_char(s, i)
        next_char = _next_char(s, i)

        (lexeme, preserve) = lex_char(
            char, prev_char, next_char, lexeme, preserve, g, c_info
        )

        # Now having dealt with char, decide whether to
        # continue accumulating the lexeme, or yield it.

        if lexeme == "":
            continue

        try:
            # The ``while t is not None: yield None; t = yield(t)``
            # construction below allows a user of the lexer to
            # yield a token, not like what they see, and then use
            # the generator's send() function to put the token
            # back into the generator.
            #
            # The first ``yield None`` in there allows the call to
            # send() on this generator to return None, and keep the
            # value of *t* ready for the next call of next() on the
            # generator.  This is the magic that allows a user to
            # 'return' a token to the generator.
            tok = Token(lexeme, grammar=g, decoder=d, pos=firstpos(lexeme, i))

            if lex_continue(char, next_char, lexeme, tok, preserve, g):
                # Any lexeme state that we want to just allow
                # to run around again and don't want to get
                # caught by the clause in the elif, should
                # test true via lex_continue()
                continue

            elif (
                next_char is None
                or not g.char_allowed(next_char)
                or next_char in g.whitespace
                or next_char in g.reserved_characters
                or s.startswith(tuple(p[0] for p in g.comments), i + 1)
                or lexeme.endswith(tuple(p[1] for p in g.comments))
                or lexeme in g.reserved_characters
                or tok.is_quoted_string()
            ):
                t = yield tok
                while t is not None:
                    yield None
                    t = yield t
                lexeme = ""
            else:
                continue

        except ValueError as err:
            # Token construction or inspection failed; re-wrap with
            # positional context and keep the original as the cause.
            raise LexerError(err, s, i, lexeme) from err