Source code for py_gql.lang.lexer

# -*- coding: utf-8 -*-
"""
Iterable interface for the GraphQL Language lexer.
"""

from string import ascii_letters
from typing import Container, Iterator, List, Mapping, Optional, Union

from .._string_utils import ensure_unicode, parse_block_string
from ..exc import (
    InvalidCharacter,
    InvalidEscapeSequence,
    NonTerminatedString,
    UnexpectedCharacter,
    UnexpectedEOF,
)
from .token import (
    EOF,
    SOF,
    Ampersand,
    At,
    BlockString,
    BracketClose,
    BracketOpen,
    Colon,
    CurlyClose,
    CurlyOpen,
    Dollar,
    Ellip,
    Equals,
    ExclamationMark,
    Float,
    Integer,
    Name,
    ParenClose,
    ParenOpen,
    Pipe,
    String,
    Token,
)


# Characters the lexer skips between tokens (used as the default ignore set of
# ``Lexer._read_over_whitespace``).
IGNORED_CHARS = (
    "\n"  # line feed
    "\r"  # carriage return
    "\ufeff"  # byte order mark
    "\t"  # horizontal tab
    " "  # space
    ","  # comma
)

# Lookup table from a token class's ``value`` attribute to the class itself.
# ``Lexer.__next__`` indexes it with the current character to build the
# matching token directly.
SYMBOLS = {
    token_cls.value: token_cls
    for token_cls in (
        ExclamationMark,
        Dollar,
        ParenOpen,
        ParenClose,
        BracketOpen,
        BracketClose,
        CurlyOpen,
        CurlyClose,
        Colon,
        Equals,
        At,
        Pipe,
        Ampersand,
    )
}

# Single-character escape sequences accepted after a backslash inside a
# regular (non-block) string, mapped to the character they stand for.
# ``\uXXXX`` escapes are not listed here; the lexer handles them separately.
QUOTED_CHARS = {
    '"': '"',
    "\\": "\\",
    "/": "/",
    "b": "\b",  # backspace (U+0008)
    "f": "\f",  # form feed (U+000C)
    "n": "\n",
    "r": "\r",
    "t": "\t",
}


class Lexer:
    """
    Iterable GraphQL language lexer / tokenizer.

    This class is not typically exposed through the parser but can be used
    independently to build custom parsers.

    Each call to ``__next__`` will read over a number of characters required
    to form a valid :class:`py_gql.lang.token.Token` and otherwise raise
    :class:`~py_gql.exc.GraphQLSyntaxError` if that is not possible.

    Args:
        source (Union[str, bytes]): Source string.
            ``bytes`` objects will be converted to Unicode.
    """

    __slots__ = ("_source", "_len", "_done", "_position", "_started")

    def __init__(self, source: Union[str, bytes]):
        self._source = ensure_unicode(source)
        # Measure the decoded text: every position used by the lexer indexes
        # ``self._source``, not the raw (possibly ``bytes``) argument.
        self._len = len(self._source)
        self._done = False
        self._started = False
        self._position = 0

    def _read_over_whitespace(
        self, __ignored: Container[str] = IGNORED_CHARS
    ) -> None:
        """
        Advance the cursor past ignored characters and ``#`` comments.

        A comment runs from ``#`` up to (but not including) the next line
        terminator or control character other than tab.
        """
        pos = self._position
        while True:
            try:
                char = self._source[pos]
            except IndexError:
                break

            if char in __ignored:
                pos += 1
            elif char == "#":
                pos += 1
                while True:
                    try:
                        char = self._source[pos]
                    except IndexError:
                        break

                    if (char >= " " or char == "\t") and char not in "\n\r":
                        pos += 1
                    else:
                        break
            else:
                break

        self._position = pos

    def _read_ellipsis(self) -> Ellip:
        """
        Read exactly three consecutive ``.`` characters.

        Raises:
            UnexpectedEOF: if the source ends before three characters.
            UnexpectedCharacter: if any of the three is not ``.``.
        """
        start = self._position
        for _ in range(3):
            try:
                char = self._source[self._position]
            except IndexError:
                raise UnexpectedEOF(self._position, self._source)

            self._position += 1
            if char != ".":
                raise UnexpectedCharacter(
                    'Expected "." but found "%s"' % char,
                    self._position,
                    self._source,
                )

        return Ellip(start, self._position)

    def _read_string(self) -> String:
        """
        Read a single-line ``"..."`` string starting at the opening quote.

        Raises:
            NonTerminatedString: on end of input or a line terminator before
                the closing quote.
            InvalidCharacter: on a control character other than tab.
            InvalidEscapeSequence: on a malformed backslash escape.
        """
        start = self._position
        self._position += 1  # Skip the opening quote.
        acc = []  # type: List[str]

        while True:
            try:
                char = self._source[self._position]
            except IndexError:
                raise NonTerminatedString("", self._position, self._source)

            self._position += 1

            if char == '"':
                value = "".join(acc)
                return String(start, self._position, value)
            elif char == "\\":
                acc.append(self._read_escape_sequence())
            elif char in "\n\r":
                raise NonTerminatedString("", self._position - 1, self._source)
            elif not (char >= " " or char == "\t"):
                raise InvalidCharacter(char, self._position - 1, self._source)
            else:
                acc.append(char)

    def _read_block_string(self) -> BlockString:
        '''
        Read a ``"""``-delimited block string starting at the opening quotes.

        The only escape recognised is ``\\"""`` which yields a literal triple
        quote; any other backslash is kept verbatim. The accumulated text is
        passed through ``parse_block_string`` before building the token.

        Raises:
            NonTerminatedString: on end of input before the closing quotes.
            InvalidCharacter: on a control character other than tab, line
                feed or carriage return.
        '''
        start = self._position
        self._position += 3  # Skip the opening triple quote.
        acc = []  # type: List[str]

        while True:
            try:
                char = self._source[self._position]
            except IndexError:
                raise NonTerminatedString("", self._position, self._source)

            if self._source[self._position : self._position + 3] == '"""':
                self._position += 3
                return BlockString(
                    start, self._position, parse_block_string("".join(acc))
                )

            self._position += 1

            if char == "\\":
                if self._source[self._position : self._position + 3] == '"""':
                    acc.append('"""')
                    self._position += 3
                else:
                    acc.append(char)
            elif not (char >= " " or char in "\t\n\r"):
                raise InvalidCharacter(char, self._position - 1, self._source)
            else:
                acc.append(char)

    def _read_escape_sequence(
        self, __quoted_chars: Mapping[str, str] = QUOTED_CHARS
    ) -> str:
        """
        Read the escape that follows a backslash and return its character.

        The cursor must be positioned just after the backslash.

        Raises:
            NonTerminatedString: if the source ends right after the backslash.
            InvalidEscapeSequence: if the escape is not recognised.
        """
        try:
            char = self._source[self._position]
        except IndexError:
            raise NonTerminatedString("", self._position + 1, self._source)

        self._position += 1

        try:
            return __quoted_chars[char]
        except KeyError:
            pass

        if char == "u":
            # unicode character: uXXXX
            return self._read_escaped_unicode()
        else:
            raise InvalidEscapeSequence(
                "\\%s" % char, self._position - 1, self._source
            )

    def _read_escaped_unicode(self) -> str:
        """
        Read the four hexadecimal digits of a ``\\uXXXX`` escape.

        The cursor must be positioned just after the ``u``.

        Raises:
            NonTerminatedString: if the source ends before four characters
                have been read.
            InvalidEscapeSequence: if the escape is not exactly four ASCII
                hexadecimal digits.
        """
        start = self._position
        for _ in range(4):
            try:
                char = self._source[self._position]
            except IndexError:
                raise NonTerminatedString("", self._position + 1, self._source)
            self._position += 1
            if not char.isalnum():
                break

        escape = self._source[start : self._position]

        # Validate explicitly rather than relying on ``int(escape, 16)``,
        # which also accepts a ``0x`` prefix (e.g. ``0x1f``) and non-ASCII
        # digits, neither of which is a valid escape.
        if len(escape) != 4 or any(
            c not in "0123456789abcdefABCDEF" for c in escape
        ):
            raise InvalidEscapeSequence(
                "\\u%s" % escape, start - 1, self._source
            )

        return chr(int(escape, 16))

    def _read_number(self) -> Union[Integer, Float]:  # noqa: C901
        """
        Read an integer or float literal starting at the cursor.

        A literal becomes a ``Float`` as soon as it has a fractional part or
        an exponent; the token value is the raw source slice.

        Raises:
            UnexpectedEOF: if the source ends where a digit is required.
            UnexpectedCharacter: on a malformed literal, or if the literal is
                immediately followed by ``_`` or an ASCII letter.
        """
        start = self._position
        is_float = False

        try:
            char = self._source[self._position]  # type: Optional[str]
        except IndexError:
            char = None

        if char == "-":
            self._position += 1

        self._read_over_integer()

        try:
            char = self._source[self._position]
        except IndexError:
            char = None

        if char == ".":
            self._position += 1
            is_float = True
            self._read_over_digits()

        try:
            char = self._source[self._position]
        except IndexError:
            char = None

        if char is not None and char in "eE":
            self._position += 1
            is_float = True

            try:
                char = self._source[self._position]
            except IndexError:
                char = None

            if char is not None and char in "+-":
                self._position += 1

            self._read_over_integer()

        # Explicit lookahead restrictions.
        try:
            next_char = self._source[self._position]
        except IndexError:
            pass
        else:
            if next_char == "_" or next_char in ascii_letters:
                # Report the offending character itself; ``char`` may still
                # hold an earlier character of the literal at this point.
                raise UnexpectedCharacter(
                    'Unexpected character "%s"' % next_char,
                    self._position,
                    self._source,
                )

        end = self._position
        value = self._source[start:end]
        return (
            Float(start, end, value) if is_float else Integer(start, end, value)
        )

    def _read_over_integer(self) -> None:
        """
        Advance over an integer part: a lone ``0`` or a run of digits.

        Raises:
            UnexpectedEOF: if the source ends at the cursor.
            UnexpectedCharacter: if ``0`` is followed by another digit, or if
                the first character is not a digit.
        """
        try:
            char = self._source[self._position]
        except IndexError:
            raise UnexpectedEOF(self._position, self._source)

        if char == "0":
            self._position += 1
            try:
                char = self._source[self._position]
            except IndexError:
                pass
            else:
                # Only ASCII digits count; ``str.isdigit`` would also match
                # characters such as superscripts or Arabic-Indic digits.
                if "0" <= char <= "9":
                    raise UnexpectedCharacter(
                        'Unexpected character "%s"' % char,
                        self._position,
                        self._source,
                    )
        else:
            self._read_over_digits()

    def _read_over_digits(self) -> None:
        """
        Advance over one or more ASCII digits.

        Raises:
            UnexpectedEOF: if the source ends at the cursor.
            UnexpectedCharacter: if the first character is not a digit.
        """
        try:
            char = self._source[self._position]
        except IndexError:
            raise UnexpectedEOF(self._position, self._source)

        if not ("0" <= char <= "9"):
            raise UnexpectedCharacter(
                'Unexpected character "%s"' % char, self._position, self._source
            )

        while "0" <= char <= "9":
            self._position += 1
            try:
                char = self._source[self._position]
            except IndexError:
                break

    def _read_name(
        self, __ascii_letters: Container[str] = ascii_letters
    ) -> Name:
        """
        Read a name made of ``_``, ASCII letters and ASCII digits.

        The caller is responsible for checking that the first character is a
        valid name start.
        """
        start = self._position
        while True:
            try:
                char = self._source[self._position]
            except IndexError:
                break

            # ASCII-only digit test: ``str.isdigit`` would let non-ASCII
            # digits into a name.
            if char == "_" or char in __ascii_letters or "0" <= char <= "9":
                self._position += 1
            else:
                break

        return Name(start, self._position, self._source[start : self._position])

    def __iter__(self) -> Iterator[Token]:
        return self

    def __next__(self) -> Token:
        """
        Advance lexer and return the next :class:`py_gql.lang.Token` instance.

        The first call returns ``SOF``; once the source is exhausted ``EOF``
        is returned once and every later call raises ``StopIteration``.

        Raises:
            UnexpectedEOF
            InvalidCharacter
            UnexpectedCharacter
            NonTerminatedString
        """
        if self._done:
            raise StopIteration()

        if not self._started:
            self._started = True
            return SOF(0, 0)

        self._read_over_whitespace()

        try:
            char = self._source[self._position]
        except IndexError:
            self._done = True
            return EOF(self._position, self._position)

        if not (char >= " " or char == "\t"):
            self._position += 1
            raise InvalidCharacter(char, self._position, self._source)

        if char in SYMBOLS:
            start = self._position
            self._position += 1
            return SYMBOLS[char](start, self._position)
        elif char == ".":
            return self._read_ellipsis()
        elif self._source[self._position : self._position + 3] == '"""':
            # Must be tested before the single-quote case below.
            return self._read_block_string()
        elif char == '"':
            return self._read_string()
        elif char == "-" or "0" <= char <= "9":
            return self._read_number()
        elif char == "_" or char in ascii_letters:
            return self._read_name()
        else:
            raise UnexpectedCharacter(
                'Unexpected character "%s"' % char, self._position, self._source
            )