LCOV - code coverage report
Current view: top level - src/plugins/yanlr - yaml_lexer.hpp (source / functions) Hit Total Coverage
Test: coverage-filtered.info Lines: 2 2 100.0 %
Date: 2019-09-12 12:28:41 Functions: 1 2 50.0 %

          Line data    Source code
       1             : /**
       2             :  * @file
       3             :  *
       4             :  * @brief This file specifies a lexer that scans a subset of YAML.
       5             :  *
       6             :  * The lexer uses the same idea as the scanner of `libyaml` (and various other
       7             :  * YAML libs) to detect simple keys (keys with no `?` prefix).
       8             :  *
       9             :  * For a detailed explanation of the algorithm, I recommend taking a look at
      10             :  * the scanner of
      11             :  *
      12             :  * - SnakeYAML Engine:
      13             :  *   https://bitbucket.org/asomov/snakeyaml-engine
      14             :  * - or LLVM’s YAML library:
      15             :  *   https://github.com/llvm-mirror/llvm/blob/master/lib/Support/YAMLParser.cpp
      16             :  *
      17             :  * .
      18             :  *
      19             :  * @copyright BSD License (see LICENSE.md or https://www.libelektra.org)
      20             :  */
      21             : 
      22             : // -- Imports ------------------------------------------------------------------
      23             : 
      24             : #include <antlr4-runtime.h>
      25             : 
      26             : #include <kdblogger.h>
      27             : 
      28             : using std::deque;
      29             : using std::pair;
      30             : using std::shared_ptr;
      31             : using std::stack;
      32             : using std::string;
      33             : using std::unique_ptr;
      34             : 
      35             : namespace yanlr
      36             : {
      37             : 
      38             : // -- Class --------------------------------------------------------------------
      39             : 
      40         124 : class YAMLLexer : public antlr4::TokenSource
      41             : {
      42             :         /** This class stores information about indentation that starts a new block node. */
      43             :         class Level
      44             :         {
      45             :         public:
      46             :                 /** This enumeration specifies the type of a block node. */
      47             :                 enum class Type
      48             :                 {
      49             :                         MAP,      ///< The current indentation starts a block map
      50             :                         SEQUENCE, ///< The current indentation starts a block sequence
      51             :                         OTHER     ///< The current indentation starts a block scalar
      52             :                 };
      53             :                 size_t indent = 0;
      54             :                 Type type = Level::Type::OTHER;
      55             : 
      56             :                 /**
      57             :                  * @brief This constructor creates a level object from the given arguments.
      58             :                  *
      59             :                  * @param indentation This number specifies the number of spaces used to start this level object.
      60             :                  * @param levelType This argument specifies the type of node `indentation` created.
      61             :                  */
      62          86 :                 Level (size_t indentation, Level::Type levelType = Level::Type::OTHER) : indent{ indentation }, type{ levelType }
      63             :                 {
      64             :                 }
      65             :         };
      66             : 
      67             :         /** This structure represents the position inside the input. */
      68             :         struct Position
      69             :         {
      70             :                 /** This member stores the offset to the start of the input in bytes. */
      71             :                 size_t index;
      72             :                 /** This member stores the line number. */
      73             :                 size_t line;
      74             :                 /** This member stores the column offset inside `line`. */
      75             :                 size_t column;
      76             : 
      77             :                 /**
      78             :                  * @brief This constructor creates a position from the given arguments.
      79             :                  *
      80             :                  * @param byteIndex This number specifies the byte offset of the position relative to the start of the input.
      81             :                  * @param lineNumber This number specifies the line number of the position.
      82             :                  * @param columnOffset This number specifies the offset to the beginning of the line.
      83             :                  */
      84             :                 Position (size_t byteIndex, size_t lineNumber, size_t columnOffset);
      85             :         };
      86             : 
      87             :         /** This variable stores the input that this lexer scans. */
      88             :         antlr4::CharStream * input;
      89             : 
      90             :         /** This queue stores the list of tokens produced by the lexer. */
      91             :         deque<unique_ptr<antlr4::CommonToken>> tokens;
      92             : 
      93             :         /** The lexer uses this factory to produce tokens. */
      94             :         Ref<antlr4::TokenFactory<antlr4::CommonToken>> factory = antlr4::CommonTokenFactory::DEFAULT;
      95             : 
      96             :         /** This pair stores the token source (this lexer) and the current `input`. */
      97             :         pair<antlr4::TokenSource *, antlr4::CharStream *> source;
      98             : 
      99             :         /**
     100             :          * This variable saves the current line position of the lexer inside
     101             :          * `input`.
     102             :          */
     103             :         size_t line = 1;
     104             : 
     105             :         /**
     106             :          * This number stores the current character position of the lexer inside of
     107             :          * `line`.
     108             :          */
     109             :         size_t column = 1;
     110             : 
     111             :         /**
     112             :          * This counter stores the number of tokens already emitted by the lexer.
     113             :          * The lexer needs this variable to keep track of the insertion point of
     114             :          * `KEY` tokens in the token queue.
     115             :          */
     116             :         size_t tokensEmitted = 0;
     117             : 
     118             :         /**
     119             :          * This stack stores the indentation (in number of characters) and block
     120             :          * type for each block node.
     121             :          */
     122             :         stack<Level> levels{ deque<Level>{ Level{ 0 } } };
     123             : 
     124             :         /**
     125             :          * This boolean specifies if the lexer has already scanned the whole input or
     126             :          * not.
     127             :          */
     128             :         bool done = false;
     129             : 
     130             :         /**
     131             :          * This pair stores a simple key candidate token (first part) and its
     132             :          * position in the token queue (second part).
     133             :          *
     134             :          * Since the lexer only supports block syntax for mappings and sequences we
     135             :          * use a single token here. If we need support for flow collections we have
     136             :          * to store a candidate for each flow level (block context = flow level 0).
     137             :          */
     138             :         pair<unique_ptr<antlr4::CommonToken>, size_t> simpleKey;
     139             : 
     140             :         /**
     141             :          * @brief This function returns the current position of the lexer inside the input.
     142             :          *
     143             :          * @return A position containing the current byte index, line number and column offset.
     144             :          */
     145             :         Position getPosition ();
     146             : 
     147             :         /**
     148             :          * @brief This function creates a new token with the specified parameters.
     149             :          *
     150             :          * @param type This parameter specifies the type of the token this function
     151             :          *             should create.
     152             :          * @param start This variable specifies the start position of the returned token
     153             :          *              inside the character stream `input`.
     154             :          * @param stop This number specifies the stop index of the returned token
     155             :          *             inside the character stream `input`.
     156             :          * @param text This string specifies the text of the returned token.
     157             :          *
     158             :          * @return A token with the specified parameters
     159             :          */
     160             :         unique_ptr<antlr4::CommonToken> commonToken (size_t type, Position const & start, size_t stop, string text);
     161             : 
     162             :         /**
     163             :          * @brief This function adds an indentation value if the given value is smaller
     164             :          *        than the current indentation.
     165             :          *
     166             :          * @param column This parameter specifies the indentation value that this
     167             :          *               function compares to the current indentation.
     168             :          *
     169             :          * @param type This value specifies the block collection type that
     170             :          *             `column` might start.
     171             :          *
     172             :          * @retval true If the function added an indentation value
     173             :          * @retval false Otherwise
     174             :          */
     175             :         bool addIndentation (size_t const column, Level::Type type);
     176             : 
     177             :         /**
     178             :          * @brief This function checks if the lexer needs to scan additional tokens.
     179             :          *
     180             :          * @retval true If the lexer should fetch additional tokens
     181             :          * @retval false Otherwise
     182             :          */
     183             :         bool needMoreTokens () const;
     184             : 
     185             :         /**
     186             :          * @brief This method adds new tokens to the token stream.
     187             :          */
     188             :         void fetchTokens ();
     189             : 
     190             :         /**
     191             :          * @brief This method consumes characters from the input stream keeping
     192             :          *        track of line and column numbers.
     193             :          *
     194             :          * @param characters This parameter specifies the number of characters the
     195             :          *                   function should consume.
     196             :          */
     197             :         void forward (size_t const characters);
     198             : 
     199             :         /**
     200             :          * @brief This method removes uninteresting characters from the input.
     201             :          */
     202             :         void scanToNextToken ();
     203             : 
     204             :         /**
     205             :          * @brief This method checks if the input at the specified offset starts a key
     206             :          *        value token.
     207             :          *
     208             :          * @param offset This parameter specifies an offset to the current position,
     209             :          *               where this function will look for a key value token.
     210             :          *
     211             :          * @retval true If the input matches a key value token
     212             :          * @retval false Otherwise
     213             :          */
     214             :         bool isValue (size_t const offset = 1) const;
     215             : 
     216             :         /**
     217             :          * @brief This method checks if the current input starts a list element.
     218             :          *
     219             :          * @retval true If the input matches a list element token
     220             :          * @retval false Otherwise
     221             :          */
     222             :         bool isElement () const;
     223             : 
     224             :         /**
     225             :          * @brief This method checks if the input at the specified offset starts a
     226             :          *        line comment.
     227             :          *
     228             :          * @param offset This parameter specifies an offset to the current position,
     229             :          *               where this function will look for a comment token.
     230             :          *
     231             :          * @retval true If the input matches a comment token
     232             :          * @retval false Otherwise
     233             :          */
     234             :         bool isComment (size_t const offset) const;
     235             : 
     236             :         /**
     237             :          * @brief This method saves a token for a simple key candidate located at the
     238             :          *        current input position.
     239             :          */
     240             :         void addSimpleKeyCandidate ();
     241             : 
     242             :         /**
     243             :          * @brief This method adds block closing tokens to the token queue, if the
     244             :          *        indentation decreased.
     245             :          *
     246             :          * @param lineIndex This parameter specifies the column (indentation in number
     247             :          *                  of spaces) for which this method should add block end
     248             :          *                  tokens.
     249             :          */
     250             :         void addBlockEnd (size_t const lineIndex);
     251             : 
     252             :         /**
     253             :          * @brief This method adds the token for the start of the YAML stream to
     254             :          *        `tokens`.
     255             :          */
     256             :         void scanStart ();
     257             : 
     258             :         /**
     259             :          * @brief This method adds the end markers to the token queue.
     260             :          */
     261             :         void scanEnd ();
     262             : 
     263             :         /**
     264             :          * @brief This method scans a single quoted scalar and adds it to the token
     265             :          *        queue.
     266             :          */
     267             :         void scanSingleQuotedScalar ();
     268             : 
     269             :         /**
     270             :          * @brief This method scans a plain scalar and adds it to the token queue.
     271             :          */
     272             :         void scanPlainScalar ();
     273             : 
     274             :         /**
     275             :          * @brief This method counts the number of non-space characters that can be
     276             :          *        part of a plain scalar at position `offset`.
     277             :          *
     278             :          * @param offset This parameter specifies an offset to the current input
     279             :          *               position, where this function searches for non-space
     280             :          *               characters.
     281             :          *
     282             :          * @return The number of non-space characters at the input position `offset`
     283             :          */
     284             :         size_t countPlainNonSpace (size_t const offset) const;
     285             : 
     286             :         /**
     287             :          * @brief This method counts the number of space characters that can be part
     288             :          *        of a plain scalar at the current input position.
     289             :          *
     290             :          * @return The number of space characters at the current input position
     291             :          */
     292             :         size_t countPlainSpace () const;
     293             : 
     294             :         /**
     295             :          * @brief This method scans a comment and adds it to the token queue.
     296             :          */
     297             :         void scanComment ();
     298             : 
     299             :         /**
     300             :          * @brief This method scans a double quoted scalar and adds it to the token
     301             :          *        queue.
     302             :          */
     303             :         void scanDoubleQuotedScalar ();
     304             : 
     305             :         /**
     306             :          * @brief This method scans a mapping value token and adds it to the token
     307             :          *        queue.
     308             :          */
     309             :         void scanValue ();
     310             : 
     311             :         /**
     312             :          * @brief This method scans a list element token and adds it to the token
     313             :          *        queue.
     314             :          */
     315             :         void scanElement ();
     316             : 
     317             : public:
     318             :         /** This token type starts the YAML stream. */
     319             :         static const size_t STREAM_START = 1;
     320             :         /** This token type ends the YAML stream. */
     321             :         static const size_t STREAM_END = 2;
     322             :         /** This token type specifies that the token stores a (line) comment. */
     323             :         static const size_t COMMENT = 3;
     324             :         /** This token type specifies that the token stores a plain scalar. */
     325             :         static const size_t PLAIN_SCALAR = 4;
     326             :         /** This token type specifies that the token stores a single quoted scalar. */
     327             :         static const size_t SINGLE_QUOTED_SCALAR = 5;
     328             :         /** This token type specifies that the token stores a double quoted scalar. */
     329             :         static const size_t DOUBLE_QUOTED_SCALAR = 6;
     330             :         /** This token type indicates the start of a mapping. */
     331             :         static const size_t MAP_START = 7;
     332             :         /** This token type indicates the end of a mapping. */
     333             :         static const size_t MAP_END = 8;
     334             :         /** This token type indicates the start of a mapping key. */
     335             :         static const size_t KEY = 9;
     336             :         /** This token type indicates the start of a mapping value. */
     337             :         static const size_t VALUE = 10;
     338             :         /** This token type indicates the start of a sequence. */
     339             :         static const size_t SEQUENCE_START = 11;
     340             :         /** This token type indicates the end of a sequence. */
     341             :         static const size_t SEQUENCE_END = 12;
     342             :         /** This token type indicates a list element. */
     343             :         static const size_t ELEMENT = 13;
     344             : 
     345             :         /**
     346             :          * @brief This constructor creates a new YAML lexer for the given input.
     347             :          *
     348             :          * @param stream This character stream stores the data this lexer scans.
     349             :          */
     350             :         YAMLLexer (antlr4::CharStream * stream);
     351             : 
     352             :         /**
     353             :          * @brief This method retrieves the current (not already emitted) token
     354             :          *        produced by the lexer.
     355             :          *
     356             :          * @return A token of the token stream produced by the lexer
     357             :          */
     358             :         unique_ptr<antlr4::Token> nextToken () override;
     359             : 
     360             :         /**
     361             :          * @brief This method retrieves the current line index.
     362             :          *
     363             :          * @return The index of the line the lexer is currently scanning
     364             :          */
     365             :         size_t getLine () const override;
     366             : 
     367             :         /**
     368             :          * @brief This method returns the position in the current line.
     369             :          *
     370             :          * @return The character index in the line the lexer is scanning
     371             :          */
     372             :         size_t getCharPositionInLine () override;
     373             : 
     374             :         /**
     375             :          * @brief This method returns the source the lexer is scanning.
     376             :          *
     377             :          * @return The input of the lexer
     378             :          */
     379             :         antlr4::CharStream * getInputStream () override;
     380             : 
     381             :         /**
     382             :          * @brief This method retrieves the name of the source the lexer is
     383             :          * currently scanning.
     384             :          *
     385             :          * @return The name of the current input source
     386             :          */
     387             :         std::string getSourceName () override;
     388             : 
     389             :         /**
     390             :          * @brief This setter changes the token factory of the lexer.
     391             :          *
     392             :          * @param tokenFactory This parameter specifies the factory that the scanner
     393             :          *                     should use to create tokens.
     394             :          */
     395             :         template <typename T1>
     396             :         void setTokenFactory (antlr4::TokenFactory<T1> * tokenFactory);
     397             : 
     398             :         /**
     399             :          * @brief Retrieve the current token factory.
     400             :          *
     401             :          * @return The factory the scanner uses to create tokens
     402             :          */
     403             :         Ref<antlr4::TokenFactory<antlr4::CommonToken>> getTokenFactory () override;
     404             : };
     405             : }

Generated by: LCOV version 1.13