LCOV - code coverage report
Current view: top level - src/plugins/yanlr - yaml_lexer.cpp (source / functions) Hit Total Coverage
Test: coverage-filtered.info Lines: 153 170 90.0 %
Date: 2019-09-12 12:28:41 Functions: 28 31 90.3 %

          Line data    Source code
       1             : /**
       2             :  * @file
       3             :  *
       4             :  * @brief This file specifies a lexer that scans a subset of YAML.
       5             :  *
       6             :  * The lexer uses the same idea as the scanner of `libyaml` (and various other
       7             :  * YAML libs) to detect simple keys (keys with no `?` prefix).
       8             :  *
       9             :  * For a detailed explanation of the algorithm, I recommend taking a look at
      10             :  * the scanner of
      11             :  *
      12             :  * - SnakeYAML Engine:
      13             :  *   https://bitbucket.org/asomov/snakeyaml-engine
      14             :  * - or LLVM’s YAML library:
      15             :  *   https://github.com/llvm-mirror/llvm/blob/master/lib/Support/YAMLParser.cpp
      16             :  *
      17             :  * .
      18             :  *
      19             :  * @copyright BSD License (see LICENSE.md or https://www.libelektra.org)
      20             :  */
      21             : 
      22             : // -- Imports ------------------------------------------------------------------
      23             : 
      24             : #include "yaml_lexer.hpp"
      25             : 
      26             : using std::make_pair;
      27             : 
      28             : using antlr4::ParseCancellationException;
      29             : 
      30             : // -- Class --------------------------------------------------------------------
      31             : 
      32             : namespace yanlr
      33             : {
      34             : 
      35             : using antlr4::CharStream;
      36             : using antlr4::CommonToken;
      37             : using antlr4::Token;
      38             : using antlr4::TokenFactory;
      39             : 
      40             : /**
      41             :  * @brief This constructor creates a new YAML lexer for the given input.
      42             :  *
      43             :  * @param stream This character stream stores the data this lexer scans.
      44             :  */
      45         310 : YAMLLexer::YAMLLexer (CharStream * stream)
      46             : {
      47          31 :         this->input = stream;
      48          93 :         this->source = make_pair (this, stream);
      49          31 :         scanStart ();
      50          31 : }
      51             : 
      52             : /**
      53             :  * @brief This function checks if the lexer needs to scan additional tokens.
      54             :  *
      55             :  * @retval true If the lexer should fetch additional tokens
      56             :  *         false Otherwise
      57             :  */
      58         881 : bool YAMLLexer::needMoreTokens () const
      59             : {
      60         881 :         if (done) return false;
      61             : 
      62        1412 :         bool keyCandidateExists = simpleKey.first != nullptr;
      63        1223 :         return keyCandidateExists || tokens.empty ();
      64             : }
      65             : 
      66             : /**
      67             :  * @brief This method retrieves the current (not already emitted) token
      68             :  *        produced by the lexer.
      69             :  *
      70             :  * @return A token of the token stream produced by the lexer
      71             :  */
      72         567 : unique_ptr<Token> YAMLLexer::nextToken ()
      73             : {
      74             :         ELEKTRA_LOG_DEBUG ("Retrieve next token");
      75         881 :         while (needMoreTokens ())
      76             :         {
      77         314 :                 fetchTokens ();
      78             : #ifdef HAVE_LOGGER
      79             :                 ELEKTRA_LOG_DEBUG ("Tokens:");
      80             :                 for (unique_ptr<CommonToken> const & token : tokens)
      81             :                 {
      82             :                         ELEKTRA_LOG_DEBUG ("\t %s", token->toString ().c_str ());
      83             :                 }
      84             : #endif
      85             :         }
      86             : 
      87             :         // If `fetchTokens` was unable to retrieve a token (error condition), we emit `EOF`.
      88        1134 :         if (tokens.size () <= 0)
      89             :         {
      90           0 :                 tokens.push_back (commonToken (Token::EOF, getPosition (), input->index (), "end of file"));
      91             :         }
      92        2268 :         unique_ptr<CommonToken> token = move (tokens.front ());
      93         567 :         tokens.pop_front ();
      94         567 :         tokensEmitted++;
      95             :         ELEKTRA_LOG_DEBUG ("Emit token %s", token->toString ().c_str ());
      96        1134 :         return token;
      97             : }
      98             : 
      99             : /**
     100             :  * @brief This method retrieves the current line index.
     101             :  *
     102             :  * @return The index of the line the lexer is currently scanning
     103             :  */
     104         652 : size_t YAMLLexer::getLine () const
     105             : {
     106         652 :         return line;
     107             : }
     108             : 
     109             : /**
     110             :  * @brief This method returns the position in the current line.
     111             :  *
     112             :  * @return The character index in the line the lexer is scanning
     113             :  */
     114         652 : size_t YAMLLexer::getCharPositionInLine ()
     115             : {
     116         652 :         return column;
     117             : }
     118             : 
     119             : /**
     120             :  * @brief This method returns the source the lexer is scanning.
     121             :  *
     122             :  * @return The input of the lexer
     123             :  */
     124           6 : CharStream * YAMLLexer::getInputStream ()
     125             : {
     126           6 :         return input;
     127             : }
     128             : 
     129             : /**
     130             :  * @brief This method retrieves the name of the source the lexer is currently
     131             :  *        scanning.
     132             :  *
     133             :  * @return The name of the current input source
     134             :  */
     135           0 : std::string YAMLLexer::getSourceName ()
     136             : {
     137           0 :         return input->getSourceName ();
     138             : }
     139             : 
     140             : /**
     141             :  * @brief This setter changes the token factory of the lexer.
     142             :  *
     143             :  * @param tokenFactory This parameter specifies the factory that the scanner
     144             :  *                     should use to create tokens.
     145             :  */
     146             : template <typename T1>
     147             : void YAMLLexer::setTokenFactory (TokenFactory<T1> * tokenFactory)
     148             : {
     149             :         factory = tokenFactory;
     150             : }
     151             : 
     152             : /**
     153             :  * @brief Retrieve the current token factory.
     154             :  *
     155             :  * @return The factory the scanner uses to create tokens
     156             :  */
     157           0 : Ref<TokenFactory<CommonToken>> YAMLLexer::getTokenFactory ()
     158             : {
     159           0 :         return factory;
     160             : }
     161             : 
     162             : // ===========
     163             : // = Private =
     164             : // ===========
     165             : 
     166             : /**
     167             :  * @brief This constructor creates a position from the given arguments.
     168             :  *
     169             :  * @param byteIndex This number specifies the byte offset of the position relative to the start of the input.
     170             :  * @param lineNumber This number specifies the line number of the position.
     171             :  * @param columnOffset This number specifies the offset to the beginning of the line.
     172             :  */
     173         660 : YAMLLexer::Position::Position (size_t byteIndex, size_t lineNumber, size_t columnOffset)
     174         660 : : index{ byteIndex }, line{ lineNumber }, column{ columnOffset }
     175             : {
     176         660 : }
     177             : 
     178             : /**
     179             :  * @brief This function returns the current position of the lexer inside the input.
     180             :  *
     181             :  * @return A position containing the current byte index, line number and column offset.
     182             :  */
     183         579 : YAMLLexer::Position YAMLLexer::getPosition ()
     184             : {
     185         579 :         return Position (input->index (), line, column);
     186             : }
     187             : 
     188             : /**
     189             :  * @brief This function creates a new token with the specified parameters.
     190             :  *
     191             :  * @param type This parameter specifies the type of the token this function
     192             :  *             should create.
     193             :  * @param start This variable specifies the start position of the returned token
     194             :  *              inside the character stream `input`.
     195             :  * @param stop This number specifies the stop index of the returned token
     196             :  *             inside the character stream `input`.
     197             :  * @param text This string specifies the text of the returned token.
     198             :  *
     199             :  * @return A token with the specified parameters
     200             :  */
     201         652 : unique_ptr<CommonToken> YAMLLexer::commonToken (size_t type, Position const & start, size_t stop, string text = "")
     202             : #if defined(__clang__)
     203             :         // Ignore warning about call on pointer of wrong object type (`CommonTokenFactory` instead of `TokenFactory<CommonToken>`)
     204             :         // This should not be a problem, since `CommonTokenFactory` inherits from `TokenFactory<CommonToken>`.
     205             :         __attribute__ ((no_sanitize ("undefined")))
     206             : #endif
     207             : {
     208         652 :         return factory->create (source, type, text, Token::DEFAULT_CHANNEL, start.index, stop, start.line, start.column);
     209             : }
     210             : 
     211             : /**
     212             :  * @brief This function adds an indentation value if the given value is smaller
     213             :  *        than the current indentation.
     214             :  *
     215             :  * @param lineIndex This parameter specifies the indentation value that this
     216             :  *                  function compares to the current indentation.
     217             :  *
     218             :  * @param type This value specifies the block collection type that
     219             :  *             `lineIndex` might start.
     220             :  *
     221             :  * @retval true If the function added an indentation value
     222             :  *         false Otherwise
     223             :  */
     224         112 : bool YAMLLexer::addIndentation (size_t const lineIndex, Level::Type type)
     225             : {
     226         224 :         if (lineIndex > levels.top ().indent)
     227             :         {
     228             :                 ELEKTRA_LOG_DEBUG ("Add indentation %zu", lineIndex);
     229         110 :                 levels.push (Level{ lineIndex, type });
     230          55 :                 return true;
     231             :         }
     232             :         return false;
     233             : }
     234             : 
     235             : /**
     236             :  * @brief This method adds new tokens to the token stream.
     237             :  */
     238         314 : void YAMLLexer::fetchTokens ()
     239             : {
     240         314 :         scanToNextToken ();
     241             : 
     242         314 :         addBlockEnd (column);
     243             : 
     244         314 :         if (input->LA (1) == Token::EOF)
     245             :         {
     246          31 :                 scanEnd ();
     247          31 :                 return;
     248             :         }
     249         283 :         else if (isValue ())
     250             :         {
     251          81 :                 scanValue ();
     252          81 :                 return;
     253             :         }
     254         202 :         else if (isElement ())
     255             :         {
     256          31 :                 scanElement ();
     257          31 :                 return;
     258             :         }
     259         171 :         else if (input->LA (1) == '"')
     260             :         {
     261          57 :                 scanDoubleQuotedScalar ();
     262          57 :                 return;
     263             :         }
     264         114 :         else if (input->LA (1) == '\'')
     265             :         {
     266           0 :                 scanSingleQuotedScalar ();
     267           0 :                 return;
     268             :         }
     269         114 :         else if (input->LA (1) == '#')
     270             :         {
     271           5 :                 scanComment ();
     272           5 :                 return;
     273             :         }
     274             : 
     275         109 :         scanPlainScalar ();
     276             : }
     277             : 
     278             : /**
     279             :  * @brief This method consumes characters from the input stream keeping
     280             :  *        track of line and column numbers.
     281             :  *
     282             :  * @param characters This parameter specifies the number of characters the
     283             :  *                   the function should consume.
     284             :  */
     285        1222 : void YAMLLexer::forward (size_t const characters = 1)
     286             : {
     287             :         ELEKTRA_LOG_DEBUG ("Forward %zu characters", characters);
     288             : 
     289        3096 :         for (size_t charsLeft = characters; charsLeft > 0; charsLeft--)
     290             :         {
     291        1876 :                 if (input->LA (1) == Token::EOF)
     292             :                 {
     293             :                         ELEKTRA_LOG_DEBUG ("Hit EOF!");
     294             :                         return;
     295             :                 }
     296             : 
     297        1874 :                 column++;
     298        1874 :                 if (input->LA (1) == '\n')
     299             :                 {
     300         151 :                         column = 1;
     301         151 :                         line++;
     302             :                 }
     303        1874 :                 input->consume ();
     304             :         }
     305             : }
     306             : 
     307             : /**
     308             :  * @brief This method removes uninteresting characters from the input.
     309             :  */
     310         314 : void YAMLLexer::scanToNextToken ()
     311             : {
     312             :         ELEKTRA_LOG_DEBUG ("Scan to next token");
     313         314 :         bool found = false;
     314         706 :         while (!found)
     315             :         {
     316         908 :                 while (input->LA (1) == ' ')
     317             :                 {
     318         258 :                         forward ();
     319             :                 }
     320             :                 ELEKTRA_LOG_DEBUG ("Skipped whitespace");
     321         392 :                 if (input->LA (1) == '\n')
     322             :                 {
     323          78 :                         forward ();
     324             :                         ELEKTRA_LOG_DEBUG ("Skipped newline");
     325             :                 }
     326             :                 else
     327             :                 {
     328             :                         found = true;
     329             :                         ELEKTRA_LOG_DEBUG ("Found next token");
     330             :                 }
     331             :         }
     332         314 : }
     333             : 
     334             : /**
     335             :  * @brief This method checks if the input at the specified offset starts a key
     336             :  *        value token.
     337             :  *
     338             :  * @param offset This parameter specifies an offset to the current position,
     339             :  *               where this function will look for a key value token.
     340             :  *
     341             :  * @retval true If the input matches a key value token
     342             :  *         false Otherwise
     343             :  */
     344        1090 : bool YAMLLexer::isValue (size_t const offset) const
     345             : {
     346        1322 :         return (input->LA (offset) == ':') &&
     347         275 :                (input->LA (offset + 1) == '\n' || input->LA (offset + 1) == ' ' || input->LA (offset + 1) == Token::EOF);
     348             : }
     349             : 
     350             : /**
     351             :  * @brief This method checks if the current input starts a list element.
     352             :  *
     353             :  * @retval true If the input matches a list element token
     354             :  *         false Otherwise
     355             :  */
     356         202 : bool YAMLLexer::isElement () const
     357             : {
     358         202 :         return (input->LA (1) == '-') && (input->LA (2) == '\n' || input->LA (2) == ' ');
     359             : }
     360             : 
     361             : /**
     362             :  * @brief This method checks if the input at the specified offset starts a line
     363             :  *        comment.
     364             :  *
     365             :  * @param offset This parameter specifies an offset to the current position,
     366             :  *               where this function will look for a comment token.
     367             :  *
     368             :  * @retval true If the input matches a comment token
     369             :  *         false Otherwise
     370             :  */
     371         656 : bool YAMLLexer::isComment (size_t const offset) const
     372             : {
     373         656 :         return (input->LA (offset) == '#') && (input->LA (offset + 1) == '\n' || input->LA (offset + 1) == ' ');
     374             : }
     375             : 
     376             : /**
     377             :  * @brief This method saves a token for a simple key candidate located at the
     378             :  *        current input position.
     379             :  */
     380         166 : void YAMLLexer::addSimpleKeyCandidate ()
     381             : {
     382         332 :         size_t position = tokens.size () + tokensEmitted;
     383        1162 :         simpleKey = make_pair (commonToken (KEY, getPosition (), input->index (), "KEY"), position);
     384         166 : }
     385             : 
     386             : /**
     387             :  * @brief This method adds block closing tokens to the token queue, if the
     388             :  *        indentation decreased.
     389             :  *
     390             :  * @param lineIndex This parameter specifies the column (indentation in number
     391             :  *                  of spaces) for which this method should add block end
     392             :  *                  tokens.
     393             :  */
     394         345 : void YAMLLexer::addBlockEnd (size_t const lineIndex)
     395             : {
     396         800 :         while (lineIndex < levels.top ().indent)
     397             :         {
     398             :                 ELEKTRA_LOG_DEBUG ("Add block end");
     399          55 :                 size_t index = input->index ();
     400         330 :                 tokens.push_back (levels.top ().type == Level::Type::MAP ?
     401          42 :                                           commonToken (MAP_END, getPosition (), index, "end of map") :
     402          68 :                                           commonToken (SEQUENCE_END, getPosition (), index, "end of sequence"));
     403          55 :                 levels.pop ();
     404             :         }
     405         345 : }
     406             : 
     407             : /**
     408             :  * @brief This method adds the token for the start of the YAML stream to
     409             :  *        `tokens`.
     410             :  */
     411          31 : void YAMLLexer::scanStart ()
     412             : {
     413             :         ELEKTRA_LOG_DEBUG ("Scan start");
     414         155 :         auto start = commonToken (STREAM_START, getPosition (), input->index (), "start of document");
     415          62 :         tokens.push_back (move (start));
     416          31 : }
     417             : 
     418             : /**
     419             :  * @brief This method adds the end markers to the token queue.
     420             :  */
     421          31 : void YAMLLexer::scanEnd ()
     422             : {
     423          31 :         addBlockEnd (0);
     424          31 :         auto start = getPosition ();
     425         155 :         tokens.push_back (commonToken (STREAM_END, start, input->index (), "end of document"));
     426         155 :         tokens.push_back (commonToken (Token::EOF, start, input->index (), "end of file"));
     427          31 :         done = true;
     428          31 : }
     429             : 
     430             : /**
     431             :  * @brief This method scans a single quoted scalar and adds it to the token
     432             :  *        queue.
     433             :  */
     434           0 : void YAMLLexer::scanSingleQuotedScalar ()
     435             : {
     436             :         ELEKTRA_LOG_DEBUG ("Scan single quoted scalar");
     437             : 
     438           0 :         auto start = getPosition ();
     439             :         // A single quoted scalar can start a simple key
     440           0 :         addSimpleKeyCandidate ();
     441             : 
     442           0 :         forward (); // Include initial single quote
     443           0 :         while (input->LA (1) != '\'' || input->LA (2) == '\'')
     444             :         {
     445           0 :                 forward ();
     446             :         }
     447           0 :         forward (); // Include closing single quote
     448           0 :         tokens.push_back (commonToken (SINGLE_QUOTED_SCALAR, start, input->index () - 1));
     449           0 : }
     450             : 
     451             : /**
     452             :  * @brief This method scans a double quoted scalar and adds it to the token
     453             :  *        queue.
     454             :  */
     455          57 : void YAMLLexer::scanDoubleQuotedScalar ()
     456             : {
     457             :         ELEKTRA_LOG_DEBUG ("Scan double quoted scalar");
     458          57 :         auto start = getPosition ();
     459             : 
     460             :         // A double quoted scalar can start a simple key
     461          57 :         addSimpleKeyCandidate ();
     462             : 
     463          57 :         forward (); // Include initial double quote
     464         504 :         while (input->LA (1) != '"')
     465             :         {
     466         447 :                 forward ();
     467             :         }
     468          57 :         forward (); // Include closing double quote
     469         285 :         tokens.push_back (commonToken (DOUBLE_QUOTED_SCALAR, start, input->index () - 1));
     470          57 : }
     471             : 
     472             : /**
     473             :  * @brief This method scans a plain scalar and adds it to the token queue.
     474             :  */
     475         109 : void YAMLLexer::scanPlainScalar ()
     476             : {
     477             :         ELEKTRA_LOG_DEBUG ("Scan plain scalar");
     478         109 :         auto start = getPosition ();
     479             :         // A plain scalar can start a simple key
     480         109 :         addSimpleKeyCandidate ();
     481             : 
     482             :         size_t lengthSpace = 0;
     483             :         size_t lengthNonSpace = 0;
     484             :         while (true)
     485             :         {
     486         234 :                 lengthNonSpace = countPlainNonSpace (lengthSpace);
     487         234 :                 if (lengthNonSpace == 0)
     488             :                 {
     489             :                         break;
     490             :                 }
     491         125 :                 forward (lengthSpace + lengthNonSpace);
     492         125 :                 lengthSpace = countPlainSpace ();
     493             :         }
     494             : 
     495         545 :         tokens.push_back (commonToken (PLAIN_SCALAR, start, input->index () - 1));
     496         109 : }
     497             : 
     498             : /**
     499             :  * @brief This method counts the number of non-space characters that can be part
     500             :  *        of a plain scalar at position `offset`.
     501             :  *
     502             :  * @param offset This parameter specifies an offset to the current input
     503             :  *               position, where this function searches for non-space
     504             :  *               characters.
     505             :  *
     506             :  * @return The number of non-space characters at the input position `offset`
     507             :  */
     508         234 : size_t YAMLLexer::countPlainNonSpace (size_t const offset) const
     509             : {
     510             :         ELEKTRA_LOG_DEBUG ("Scan non-space characters");
     511         702 :         string const stop = " \n";
     512             : 
     513         234 :         size_t lookahead = offset + 1;
     514        2192 :         while (stop.find (input->LA (lookahead)) == string::npos && input->LA (lookahead) != Token::EOF && !isValue (lookahead) &&
     515         656 :                !isComment (lookahead))
     516             :         {
     517         651 :                 lookahead++;
     518             :         }
     519             : 
     520             :         ELEKTRA_LOG_DEBUG ("Found %zu non-space characters", lookahead - offset - 1);
     521         468 :         return lookahead - offset - 1;
     522             : }
     523             : 
     524             : /**
     525             :  * @brief This method counts the number of space characters that can be part
     526             :  *        of a plain scalar at the current input position.
     527             :  *
     528             :  * @return The number of space characters at the current input position
     529             :  */
     530         125 : size_t YAMLLexer::countPlainSpace () const
     531             : {
     532             :         ELEKTRA_LOG_DEBUG ("Scan spaces");
     533         125 :         size_t lookahead = 1;
     534         181 :         while (input->LA (lookahead) == ' ')
     535             :         {
     536          28 :                 lookahead++;
     537             :         }
     538             :         ELEKTRA_LOG_DEBUG ("Found %zu space characters", lookahead - 1);
     539         125 :         return lookahead - 1;
     540             : }
     541             : 
     542             : /**
     543             :  * @brief This method scans a comment and adds it to the token queue.
     544             :  */
     545           5 : void YAMLLexer::scanComment ()
     546             : {
     547             :         ELEKTRA_LOG_DEBUG ("Scan comment");
     548           5 :         auto start = getPosition ();
     549             : 
     550          93 :         while (input->LA (1) != '\n' && input->LA (1) != Token::EOF)
     551             :         {
     552          88 :                 forward ();
     553             :         }
     554          25 :         tokens.push_back (commonToken (COMMENT, start, input->index () - 1));
     555           5 : }
     556             : 
     557             : /**
     558             :  * @brief This method scans a mapping value token and adds it to the token
     559             :  *        queue.
     560             :  */
     561          81 : void YAMLLexer::scanValue ()
     562             : {
     563             :         ELEKTRA_LOG_DEBUG ("Scan value");
     564         405 :         tokens.push_back (commonToken (VALUE, getPosition (), input->index () + 1));
     565          81 :         forward (input->LA (1) == Token::EOF ? 1 : 2);
     566         162 :         if (simpleKey.first == nullptr)
     567             :         {
     568           0 :                 throw ParseCancellationException ("Unable to locate key for value");
     569             :         }
     570             :         auto const start =
     571         324 :                 Position{ simpleKey.first->getStartIndex (), simpleKey.first->getLine (), simpleKey.first->getCharPositionInLine () };
     572          81 :         size_t offset = simpleKey.second - tokensEmitted;
     573         324 :         tokens.insert (tokens.begin () + offset, move (simpleKey.first));
     574          81 :         if (addIndentation (start.column, Level::Type::MAP))
     575             :         {
     576         294 :                 tokens.insert (tokens.begin () + offset, commonToken (MAP_START, start, start.index, "start of map"));
     577             :         }
     578          81 : }
     579             : 
     580             : /**
     581             :  * @brief This method scans a list element token and adds it to the token
     582             :  *        queue.
     583             :  */
     584          31 : void YAMLLexer::scanElement ()
     585             : {
     586             :         ELEKTRA_LOG_DEBUG ("Scan element");
     587          31 :         if (addIndentation (column, Level::Type::SEQUENCE))
     588             :         {
     589          65 :                 tokens.push_back (commonToken (SEQUENCE_START, getPosition (), column, "start of sequence"));
     590             :         }
     591         155 :         tokens.push_back (commonToken (ELEMENT, getPosition (), input->index () + 1));
     592          31 :         forward (2);
     593          31 : }
     594         140 : }

Generated by: LCOV version 1.13