Line data Source code
1 : /**
2 : * @file
3 : *
4 : * @brief This file specifies a lexer that scans a subset of YAML.
5 : *
6 : * The lexer uses the same idea as the scanner of `libyaml` (and various other
7 : * YAML libs) to detect simple keys (keys with no `?` prefix).
8 : *
9 : * For a detailed explanation of the algorithm, I recommend taking a look at
10 : * the scanner of
11 : *
12 : * - SnakeYAML Engine:
13 : * https://bitbucket.org/asomov/snakeyaml-engine
14 : * - or LLVM’s YAML library:
15 : * https://github.com/llvm-mirror/llvm/blob/master/lib/Support/YAMLParser.cpp
16 : *
17 : * .
18 : *
19 : * @copyright BSD License (see LICENSE.md or https://www.libelektra.org)
20 : */
21 :
22 : // -- Imports ------------------------------------------------------------------
23 :
24 : #include <antlr4-runtime.h>
25 :
26 : #include <kdblogger.h>
27 :
28 : using std::deque;
29 : using std::pair;
30 : using std::shared_ptr;
31 : using std::stack;
32 : using std::string;
33 : using std::unique_ptr;
34 :
35 : namespace yanlr
36 : {
37 :
38 : // -- Class --------------------------------------------------------------------
39 : /** This class implements an ANTLR token source that scans a subset of YAML. */
40 124 : class YAMLLexer : public antlr4::TokenSource
41 : {
42 : /** This class stores information about indentation that starts a new block node. */
43 : class Level
44 : {
45 : public:
46 : /** This enumeration specifies the type of a block node. */
47 : enum class Type
48 : {
49 : MAP, ///< The current indentation starts a block map
50 : SEQUENCE, ///< The current indentation starts a block sequence
51 : OTHER ///< The current indentation starts a block scalar
52 : };
53 : size_t indent = 0; ///< The number of spaces used to start this level
54 : Type type = Level::Type::OTHER; ///< The type of block node this level starts
55 :
56 : /**
57 : * @brief This constructor creates a level object from the given arguments.
58 : *
59 : * @param indentation This number specifies the number of spaces used to start this level object.
60 : * @param levelType This argument specifies the type of node `indentation` created.
61 : */
62 86 : Level (size_t indentation, Level::Type levelType = Level::Type::OTHER) : indent{ indentation }, type{ levelType }
63 : {
64 : }
65 : };
66 :
67 : /** This structure represents the position inside the input. */
68 : struct Position
69 : {
70 : /** This parameter stores the offset to the start of the input in bytes. */
71 : size_t index;
72 : /** This parameter stores the line number. */
73 : size_t line;
74 : /** This parameter stores the column offset inside `line`. */
75 : size_t column;
76 :
77 : /**
78 : * @brief This constructor creates a position from the given arguments.
79 : *
80 : * @param byteIndex This number specifies the byte offset of the position relative to the start of the input.
81 : * @param lineNumber This number specifies the line number of the position.
82 : * @param columnOffset This number specifies the offset to the beginning of the line.
83 : */
84 : Position (size_t byteIndex, size_t lineNumber, size_t columnOffset);
85 : };
86 :
87 : /** This variable stores the input that this lexer scans. */
88 : antlr4::CharStream * input;
89 :
90 : /** This queue stores the list of tokens produced by the lexer. */
91 : deque<unique_ptr<antlr4::CommonToken>> tokens;
92 :
93 : /** The lexer uses this factory to produce tokens. */
94 : Ref<antlr4::TokenFactory<antlr4::CommonToken>> factory = antlr4::CommonTokenFactory::DEFAULT;
95 :
96 : /** This pair stores the token source (this lexer) and the current `input`. */
97 : pair<antlr4::TokenSource *, antlr4::CharStream *> source;
98 :
99 : /**
100 : * This variable saves the current line position of the lexer inside
101 : * `input`.
102 : */
103 : size_t line = 1;
104 :
105 : /**
106 : * This number stores the current character position of the lexer inside of
107 : * `line`.
108 : */
109 : size_t column = 1;
110 :
111 : /**
112 : * This counter stores the number of tokens already emitted by the lexer.
113 : * The lexer needs this variable, to keep track of the insertion point of
114 : * `KEY` tokens in the token queue.
115 : */
116 : size_t tokensEmitted = 0;
117 :
118 : /**
119 : * This stack stores the indentation (in number of characters) and block
120 : * type for each block node.
121 : */
122 : stack<Level> levels{ deque<Level>{ Level{ 0 } } };
123 :
124 : /**
125 : * This boolean specifies if the lexer has already scanned the whole input or
126 : * not.
127 : */
128 : bool done = false;
129 :
130 : /**
131 : * This pair stores a simple key candidate token (first part) and its
132 : * position in the token queue (second part).
133 : *
134 : * Since the lexer only supports block syntax for mappings and sequences we
135 : * use a single token here. If we need support for flow collections we have
136 : * to store a candidate for each flow level (block context = flow level 0).
137 : */
138 : pair<unique_ptr<antlr4::CommonToken>, size_t> simpleKey;
139 :
140 : /**
141 : * @brief This function returns the current position of the lexer inside the input.
142 : *
143 : * @return A position containing the current byte index, line number and column offset.
144 : */
145 : Position getPosition ();
146 :
147 : /**
148 : * @brief This function creates a new token with the specified parameters.
149 : *
150 : * @param type This parameter specifies the type of the token this function
151 : * should create.
152 : * @param start This variable specifies the start position of the returned token
153 : * inside the character stream `input`.
154 : * @param stop This number specifies the stop index of the returned token
155 : * inside the character stream `input`.
156 : * @param text This string specifies the text of the returned token.
157 : *
158 : * @return A token with the specified parameters
159 : */
160 : unique_ptr<antlr4::CommonToken> commonToken (size_t type, Position const & start, size_t stop, string text);
161 :
162 : /**
163 : * @brief This function adds an indentation value if the given value is smaller
164 : * than the current indentation.
165 : *
166 : * @param lineIndex This parameter specifies the indentation value that this
167 : * function compares to the current indentation.
168 : *
169 : * @param type This value specifies the block collection type that
170 : * `lineIndex` might start.
171 : *
172 : * @retval true If the function added an indentation value
173 : * @retval false Otherwise
174 : */
175 : bool addIndentation (size_t const lineIndex, Level::Type type);
176 :
177 : /**
178 : * @brief This function checks if the lexer needs to scan additional tokens.
179 : *
180 : * @retval true If the lexer should fetch additional tokens
181 : * @retval false Otherwise
182 : */
183 : bool needMoreTokens () const;
184 :
185 : /**
186 : * @brief This method adds new tokens to the token stream.
187 : */
188 : void fetchTokens ();
189 :
190 : /**
191 : * @brief This method consumes characters from the input stream keeping
192 : * track of line and column numbers.
193 : *
194 : * @param characters This parameter specifies the number of characters the
195 : * function should consume.
196 : */
197 : void forward (size_t const characters);
198 :
199 : /**
200 : * @brief This method removes uninteresting characters from the input.
201 : */
202 : void scanToNextToken ();
203 :
204 : /**
205 : * @brief This method checks if the input at the specified offset starts a key
206 : * value token.
207 : *
208 : * @param offset This parameter specifies an offset to the current position,
209 : * where this function will look for a key value token.
210 : *
211 : * @retval true If the input matches a key value token
212 : * @retval false Otherwise
213 : */
214 : bool isValue (size_t const offset = 1) const;
215 :
216 : /**
217 : * @brief This method checks if the current input starts a list element.
218 : *
219 : * @retval true If the input matches a list element token
220 : * @retval false Otherwise
221 : */
222 : bool isElement () const;
223 :
224 : /**
225 : * @brief This method checks if the input at the specified offset starts a
226 : * line comment.
227 : *
228 : * @param offset This parameter specifies an offset to the current position,
229 : * where this function will look for a comment token.
230 : *
231 : * @retval true If the input matches a comment token
232 : * @retval false Otherwise
233 : */
234 : bool isComment (size_t const offset) const;
235 :
236 : /**
237 : * @brief This method saves a token for a simple key candidate located at the
238 : * current input position.
239 : */
240 : void addSimpleKeyCandidate ();
241 :
242 : /**
243 : * @brief This method adds block closing tokens to the token queue, if the
244 : * indentation decreased.
245 : *
246 : * @param lineIndex This parameter specifies the column (indentation in number
247 : * of spaces) for which this method should add block end
248 : * tokens.
249 : */
250 : void addBlockEnd (size_t const lineIndex);
251 :
252 : /**
253 : * @brief This method adds the token for the start of the YAML stream to
254 : * `tokens`.
255 : */
256 : void scanStart ();
257 :
258 : /**
259 : * @brief This method adds the end markers to the token queue.
260 : */
261 : void scanEnd ();
262 :
263 : /**
264 : * @brief This method scans a single quoted scalar and adds it to the token
265 : * queue.
266 : */
267 : void scanSingleQuotedScalar ();
268 :
269 : /**
270 : * @brief This method scans a plain scalar and adds it to the token queue.
271 : */
272 : void scanPlainScalar ();
273 :
274 : /**
275 : * @brief This method counts the number of non-space characters that can be
276 : * part of a plain scalar at position `offset`.
277 : *
278 : * @param offset This parameter specifies an offset to the current input
279 : * position, where this function searches for non-space
280 : * characters.
281 : *
282 : * @return The number of non-space characters at the input position `offset`
283 : */
284 : size_t countPlainNonSpace (size_t const offset) const;
285 :
286 : /**
287 : * @brief This method counts the number of space characters that can be part
288 : * of a plain scalar at the current input position.
289 : *
290 : * @return The number of space characters at the current input position
291 : */
292 : size_t countPlainSpace () const;
293 :
294 : /**
295 : * @brief This method scans a comment and adds it to the token queue.
296 : */
297 : void scanComment ();
298 :
299 : /**
300 : * @brief This method scans a double quoted scalar and adds it to the token
301 : * queue.
302 : */
303 : void scanDoubleQuotedScalar ();
304 :
305 : /**
306 : * @brief This method scans a mapping value token and adds it to the token
307 : * queue.
308 : */
309 : void scanValue ();
310 :
311 : /**
312 : * @brief This method scans a list element token and adds it to the token
313 : * queue.
314 : */
315 : void scanElement ();
316 :
317 : public:
318 : /** This token type starts the YAML stream. */
319 : static const size_t STREAM_START = 1;
320 : /** This token type ends the YAML stream. */
321 : static const size_t STREAM_END = 2;
322 : /** This token type specifies that the token stores a (line) comment. */
323 : static const size_t COMMENT = 3;
324 : /** This token type specifies that the token stores a plain scalar. */
325 : static const size_t PLAIN_SCALAR = 4;
326 : /** This token type specifies that the token stores a single quoted scalar. */
327 : static const size_t SINGLE_QUOTED_SCALAR = 5;
328 : /** This token type specifies that the token stores a double quoted scalar. */
329 : static const size_t DOUBLE_QUOTED_SCALAR = 6;
330 : /** This token type indicates the start of a mapping. */
331 : static const size_t MAP_START = 7;
332 : /** This token type indicates the end of a mapping. */
333 : static const size_t MAP_END = 8;
334 : /** This token type indicates the start of a mapping key. */
335 : static const size_t KEY = 9;
336 : /** This token type indicates the start of a mapping value. */
337 : static const size_t VALUE = 10;
338 : /** This token type indicates the start of a sequence. */
339 : static const size_t SEQUENCE_START = 11;
340 : /** This token type indicates the end of a sequence. */
341 : static const size_t SEQUENCE_END = 12;
342 : /** This token type indicates a list element. */
343 : static const size_t ELEMENT = 13;
344 :
345 : /**
346 : * @brief This constructor creates a new YAML lexer for the given input.
347 : *
348 : * @param stream This character stream stores the data this lexer scans.
349 : */
350 : YAMLLexer (antlr4::CharStream * stream);
351 :
352 : /**
353 : * @brief This method retrieves the current (not already emitted) token
354 : * produced by the lexer.
355 : *
356 : * @return A token of the token stream produced by the lexer
357 : */
358 : unique_ptr<antlr4::Token> nextToken () override;
359 :
360 : /**
361 : * @brief This method retrieves the current line index.
362 : *
363 : * @return The index of the line the lexer is currently scanning
364 : */
365 : size_t getLine () const override;
366 :
367 : /**
368 : * @brief This method returns the position in the current line.
369 : *
370 : * @return The character index in the line the lexer is scanning
371 : */
372 : size_t getCharPositionInLine () override;
373 :
374 : /**
375 : * @brief This method returns the source the lexer is scanning.
376 : *
377 : * @return The input of the lexer
378 : */
379 : antlr4::CharStream * getInputStream () override;
380 :
381 : /**
382 : * @brief This method retrieves the name of the source the lexer is
383 : * currently scanning.
384 : *
385 : * @return The name of the current input source
386 : */
387 : std::string getSourceName () override;
388 :
389 : /**
390 : * @brief This setter changes the token factory of the lexer.
391 : *
392 : * @param tokenFactory This parameter specifies the factory that the scanner
393 : * should use to create tokens.
394 : */
395 : template <typename T1>
396 : void setTokenFactory (antlr4::TokenFactory<T1> * tokenFactory);
397 :
398 : /**
399 : * @brief Retrieve the current token factory.
400 : *
401 : * @return The factory the scanner uses to create tokens
402 : */
403 : Ref<antlr4::TokenFactory<antlr4::CommonToken>> getTokenFactory () override;
404 : };
405 : }
|