source: rtems-central/formal/promela/src/src/modules/comment_filter/comment_filter/rfc.py @ a64e772

Last change on this file since a64e772 was a64e772, checked in by Andrew Butterfield <Andrew.Butterfield@…>, on 01/13/23 at 15:41:01

modifications made to comment_filter

  • Property mode set to 100644
File size: 12.6 KB
Line 
1import re
2
class State:
    """
    Parser state that is carried from one input line to the next.

    Args:
      line (string, default: ''):
        The remaining input.
      multi_end_stack (list<string>, default: None):
        Initial stack of multi-line comment end tokens.  A falsy value
        (None or an empty list) is replaced with a fresh empty list.
      in_literal (string, default: None):
        The end quote the parser is waiting for, if any.
    """
    def __init__(self, line='', multi_end_stack=None, in_literal=None):
        # The remaining input.
        self.line = line

        # A stack of end tokens for multi-line comments.  The token on the
        # top of the stack is the expected end token for the most nested
        # multi-line comment.
        self.multi_end_stack = multi_end_stack or []

        # If the parser is waiting on the end quote, in_literal will be
        # the string the parser is waiting for.
        self.in_literal = in_literal

    def __eq__(self, x):
        # Comparing against something that is not a State must not raise
        # AttributeError; let Python fall back to its default handling.
        if not isinstance(x, State):
            return NotImplemented
        # Equal only if all members are equal.
        return (self.line == x.line
                and self.multi_end_stack == x.multi_end_stack
                and self.in_literal == x.in_literal)

    # Instances are mutable and compare by value, so they are explicitly
    # unhashable.
    __hash__ = None

    def __repr__(self):
        return 'State(line={!r}, multi_end_stack={!r}, in_literal={!r})'.format(
            self.line, self.multi_end_stack, self.in_literal)
22
23
def parse_file(lang, file_obj, code_only=False, keep_tokens=True):
    """
    Lazily filter every line of file_obj.

    A single parser State is threaded through all lines, so unterminated
    string literals and multi-line comments carry over to the next line.

    Args:
      lang (Language):
        Syntax description for the language being parsed.
      file_obj (iterator<string>):
        An iterator that yields lines.
      code_only (bool, default: False):
        If False, each non-comment character is replaced with a space.
        If True, each comment character is replaced with a space.
      keep_tokens (bool, default: True):
        If False, comment tokens are filtered out.
        If True, comment tokens are preserved.

    Returns:
      A generator that yields, for each input line, the first element
      of the pair returned by parse_line (a list of string segments).
    """
    parser_state = State()
    for raw_line in file_obj:
        parser_state.line = raw_line
        filtered, parser_state = parse_line(
            lang, parser_state, code_only, keep_tokens)
        yield filtered
49
50
def parse_line(lang, state, code_only=False, keep_tokens=True):
    """
    Split state.line into its filtered comment/code segments.

    Filtered out characters are represented as spaces.  If the previous
    line left a string literal or a multi-line comment open, that
    construct is completed first; only when it is closed on this line is
    the remainder scanned for further declarations.

    Args:
      lang (Language):
        Syntax description for the language being parsed.
      state (State):
        Parser state.
      code_only (bool, default: False):
        If False, each non-comment character is replaced with a space.
        If True, each comment character is replaced with a space.
      keep_tokens (bool, default: True):
        If False, comment tokens are filtered out.
        If True, comment tokens are preserved.

    Returns:
      (list<string>, State)
    """
    # Text (and, for comments, the end token) that completes a construct
    # left open by an earlier line.
    pending = ''
    pending_end = ''

    if state.in_literal:
        # Still inside a string literal: literals count as code.
        literal_rest, state = finish_string_literal(state.in_literal, state)
        pending = literal_rest if code_only else clear_line(literal_rest)
    elif state.multi_end_stack:
        # A non-empty stack means a multi-line comment was opened earlier
        # and its end token has not been seen yet.
        comment_rest, end_token, state = finish_multiline_comment(
            lang, state, keep_tokens)
        if code_only:
            pending, pending_end = \
                clear_line(comment_rest), clear_line(end_token)
        else:
            pending, pending_end = comment_rest, end_token

    still_open = state.in_literal or state.multi_end_stack
    if still_open:
        # The whole line was consumed by the open construct.
        return [pending + pending_end], state

    segments, state = parse_declarations(lang, state, code_only, keep_tokens)
    return [pending, pending_end] + segments, state
100
101
def parse_declarations(lang, state, code_only=False, keep_tokens=True):
    """
    Split state.line into its filtered comment/code segments.

    In contrast to parse_line, the parser must *not* currently be inside
    a multi-line comment when this function is called.

    Args:
      lang (Language):
        Syntax description for the language being parsed.
      state (State):
        Parser state.
      code_only (bool, default: False):
        If False, each non-comment character is replaced with a space.
        If True, each comment character is replaced with a space.
      keep_tokens (bool, default: True):
        If False, comment tokens are filtered out.
        If True, comment tokens are preserved.

    Returns:
      (list<string>, State)
    """
    code, state = parse_code(lang, state)
    line_cmt, state = parse_line_comment(lang, state, keep_tokens)
    block_cmt, block_end, state = parse_multiline_comment(
        lang, state, keep_tokens)

    if not (line_cmt or block_cmt or block_end):
        # No comment follows the code: the line is finished.
        state.line = ''
        return [code if code_only else clear_line(code)], state

    if state.multi_end_stack:
        # A multi-line comment is still open; keep whatever is left.
        tail = [state.line]
    else:
        # Continue looking for declarations after the comment.
        tail, state = parse_declarations(lang, state, code_only, keep_tokens)

    if code_only:
        head = [code, clear_line(line_cmt),
                clear_line(block_cmt), clear_line(block_end)]
    else:
        head = [clear_line(code), line_cmt, block_cmt, block_end]
    return head + tail, state
144
145
def parse_code(lang, state):
    """
    Consume and return everything in state.line that precedes the first
    comment.  String literals are treated as code, so comment tokens
    inside them do not end the scan.

    Args:
      lang (Language):
        Syntax description for the language being parsed.
      state (State):
        Parser state.

    Returns:
      (string, State)
    """
    block_starts = [opener for opener, _ in lang.comment_bookends]
    stop_tokens = block_starts + lang.line_comment + [
        lang.string_literal_start,
        lang.string_literal2_start]

    collected = ''
    while True:
        text = state.line
        pos = index_of_first_found(text, stop_tokens)
        if pos == -1:
            # Neither a comment nor a literal: the rest is all code.
            state.line = ''
            return collected + text, state

        collected += text[:pos]
        state.line = text[pos:]

        at_line_comment = any(
            text.startswith(tok, pos) for tok in lang.line_comment)
        if at_line_comment or \
                index_of_first_found(text, block_starts) == pos:
            # A comment starts here; leave it in state.line for the caller.
            return collected, state

        # Otherwise the token is one of the two string literal openers.
        if text.startswith(lang.string_literal_start, pos):
            quote = lang.string_literal_start
        else:
            quote = lang.string_literal2_start
        literal, state = parse_string_literal(quote, state)
        collected += literal
186
187
def parse_string_literal(quote, state):
    """
    Consume the string literal that state.line starts with.

    If state.line does not begin with quote, nothing is consumed and
    the empty string is returned.

    Args:
      quote (string):
        The syntax for the start and end quote.
      state (State):
        Parser state.

    Returns:
      (string, State)
    """
    if not state.line.startswith(quote):
        return '', state

    # Step over the opening quote, then read up to the closing one.
    state.line = state.line[len(quote):]
    remainder, state = finish_string_literal(quote, state)
    return quote + remainder, state
208
209
def finish_string_literal(quote, state):
    """
    Consume the rest of a string literal whose start quote has already
    been consumed.

    If the end quote is found it is consumed, included in the result and
    state.in_literal is reset to None.  Otherwise state.in_literal is set
    to quote so that parsing can resume on the next line.

    Args:
      quote (string):
        The syntax for the end quote.
      state (State):
        Parser state.

    Returns:
      (string, State)
    """
    body, state = parse_string_literal_contents(quote, state)
    closed = state.line.startswith(quote)
    if closed:
        state.line = state.line[len(quote):]
        body += quote
    # No end-quote yet means the literal continues on the next line.
    state.in_literal = None if closed else quote
    return body, state
220
221
def parse_string_literal_contents(quote, state):
    """
    Returns the string literal contents at the beginning of state.line.
    The end quote is not included.

    A backslash escapes the character that follows it.  An escaped quote
    therefore does not end the literal, while an escaped backslash
    directly in front of a quote does not hide that quote.

    Args:
      quote (string):
        The syntax for the end quote.
      state (State):
        Parser state.

    Returns:
      (string, State)
    """
    pieces = []
    rest = state.line
    while True:
        quote_at = rest.find(quote)
        escape_at = rest.find('\\')

        if escape_at != -1 and (quote_at == -1 or escape_at < quote_at):
            # Consume the backslash together with the character it
            # escapes (if the line does not end right after it).
            cut = min(escape_at + 2, len(rest))
            pieces.append(rest[:cut])
            rest = rest[cut:]
            continue

        if quote_at == -1:
            # No end-quote.  Chew up the whole line.
            pieces.append(rest)
            rest = ''
        else:
            # Stop in front of the end quote; the caller consumes it.
            pieces.append(rest[:quote_at])
            rest = rest[quote_at:]

        state.line = rest
        return ''.join(pieces), state
256
257
def parse_line_comment(lang, state, keep_tokens=True):
    """
    Consume the single-line comment that state.line starts with.

    The first token of lang.line_comment that state.line begins with
    wins; the comment then runs to the end of the line.  If no token
    matches, nothing is consumed and the empty string is returned.

    Args:
      lang (Language):
        Syntax description for the language being parsed.
      state (State):
        Parser state
      keep_tokens (bool, default: True):
        If False, comment tokens are filtered out.
        If True, comment tokens are preserved.

    Returns:
      (string, State)
    """
    text = state.line
    token = next(
        (tok for tok in lang.line_comment if text.startswith(tok)), None)
    if token is None:
        return '', state

    state.line = ''
    width = len(token)
    # A filtered-out token is blanked so the column positions are kept.
    shown = token if keep_tokens else ' ' * width
    return shown + text[width:], state
284
285
def parse_multiline_comment(lang, state, keep_tokens=True):
    """
    Consume the multi-line comment that state.line starts with.

    When a start token of lang.comment_bookends matches, its end token
    is pushed onto state.multi_end_stack and the comment is read as far
    as this line allows.  If no start token matches, nothing is consumed
    and empty strings are returned.

    Args:
      lang (Language):
        Syntax description for the language being parsed.
      state (State):
        Parser state
      keep_tokens (bool, default: True):
        If False, comment tokens are filtered out.
        If True, comment tokens are preserved.

    Returns:
      (string, string, State): the start token with the comment text,
      then the second value returned by finish_multiline_comment, then
      the state.
    """
    text = state.line
    for opener, closer in lang.comment_bookends:
        if not text.startswith(opener):
            continue

        state.multi_end_stack.append(closer)
        state.line = text[len(opener):]
        body, end_token, state = finish_multiline_comment(
            lang, state, keep_tokens)
        # A filtered-out token is blanked so the column positions are kept.
        shown = opener if keep_tokens else ' ' * len(opener)
        return shown + body, end_token, state

    return '', '', state
313
314
def finish_multiline_comment(lang, state, keep_tokens=True):
    """
    Consume the rest of the multi-line comment at the beginning of
    state.line, as far as this line allows.

    Args:
      lang (Language):
        Syntax description for the language being parsed.
      state (State):
        Parser state
      keep_tokens (bool, default: True):
        If False, comment tokens are filtered out.
        If True, comment tokens are preserved.

    Returns:
      (string, string, State): the comment text, then the end token
      (blanked if keep_tokens is False, empty if the line ran out before
      the comment was closed), then the state.
    """
    body, state = parse_multiline_contents(lang, state)
    # Remember the expected end token before a nested comment can push
    # its own end token onto the stack.
    closing = state.multi_end_stack[-1]

    nested = ''
    nested_end = ''
    if lang.nested_comments:
        # The language supports nested comments.
        nested, nested_end, state = parse_multiline_comment(
            lang, state, keep_tokens)

    so_far = body + nested + nested_end
    remaining = state.line

    if not remaining:
        # End of line reached; the comment stays open.
        return so_far, '', state

    if not remaining.startswith(closing):
        # More comment text follows (e.g. after a nested comment).
        tail, tail_end, state = finish_multiline_comment(
            lang, state, keep_tokens)
        return so_far + tail, tail_end, state

    state.multi_end_stack.pop()
    state.line = remaining[len(closing):]
    shown = closing if keep_tokens else ' ' * len(closing)
    return so_far, shown, state
355
356
def parse_multiline_contents(lang, state):
    """
    Consume the multi-line comment text at the beginning of state.line.

    Reading stops in front of the expected end token (the top of
    state.multi_end_stack) or, if lang.nested_comments is set, in front
    of any comment start token, whichever comes first.  If neither is
    found the whole line is consumed.

    Args:
      lang (Language):
        Syntax description for the language being parsed.
      state (State):
        Parser state

    Returns:
      (string, State)
    """
    text = state.line
    candidates = [opener for opener, _ in lang.comment_bookends]
    closer = state.multi_end_stack[-1]
    candidates.append(closer)

    if lang.nested_comments:
        cut = index_of_first_found(text, candidates)
    else:
        cut = text.find(closer)

    if cut == -1:
        # Reached the end of line before the end of comment.
        state.line = ''
        return text, state

    state.line = text[cut:]
    return text[:cut], state
390
391
def index_of_first_found(s, xs):
    """
    Return the index of the first string from xs found in s.

    Empty (or None) entries of xs are ignored: they can never be
    meaningfully "found", and an empty alternative in the regular
    expression would match at index 0 of every string.

    Args:
      s (string):
        The string to search.
      xs (iterable<string>):
        The strings to look for.  They are matched literally.

    Returns:
      The lowest index at which any string from xs occurs in s, or -1 if
      none occurs (including when xs has no non-empty entries).
    """
    alternatives = [re.escape(x) for x in xs if x]
    if not alternatives:
        return -1
    m = re.search('|'.join(alternatives), s)
    return m.start() if m else -1
402
403
def clear_line(line):
    """
    Blank out a line: return a string of the same length in which every
    character is a space, except that a trailing line separator (as
    reported by get_linesep) is kept unchanged.
    """
    ending = get_linesep(line)
    # With no separator, `ending` is empty and the line is all spaces.
    return ' ' * (len(line) - len(ending)) + ending
413
414
def get_linesep(line):
    """
    Return the line separator that line ends with: '\\r\\n' or '\\n'.
    If line ends with neither, return the empty string.
    """
    # Slices never raise on short strings, so no length checks are needed.
    if line[-2:] == '\r\n':
        return '\r\n'
    if line[-1:] == '\n':
        return '\n'
    return ''
Note: See TracBrowser for help on using the repository browser.