parser.py

# Copyright (c) 2016-2017 Jani Nikula <jani@nikula.org>
# Copyright (c) 2018-2019 Bruno Santos <brunomanuelsantos@tecnico.ulisboa.pt>
# Licensed under the terms of BSD 2-Clause, see LICENSE for details.
"""
Documentation comment extractor
===============================

This module extracts relevant documentation comments, optionally reformatting
them in reST syntax.

This is the part that uses Clang Python Bindings to extract documentation
comments from C source code. This module does not depend on Sphinx.

There are two passes:

#. Pass over the tokens to find all the comments, including ones that aren't
   attached to cursors.

#. Pass over the cursors to document them.

There is minimal syntax parsing or input conversion:

* Identification of documentation comment blocks, and stripping the comment
  delimiters (``/**`` and ``*/``) and continuation line prefixes (e.g. ``␣*␣``).

* Identification of function-like macros.

* Indentation for reST C Domain directive blocks.

* An optional external filter may be invoked to support different syntaxes.
  These filters are expected to translate the comment into the reST format.

Otherwise, documentation comments are passed through verbatim.
"""

import enum
import itertools
import sys

from clang.cindex import CursorKind, TypeKind
from clang.cindex import Index, TranslationUnit
from clang.cindex import SourceLocation, SourceRange
from clang.cindex import TokenKind, TokenGroup

from hawkmoth.util import docstr, doccompat

class ErrorLevel(enum.Enum):
    """
    Supported error levels in inverse numerical order of severity. The values
    are chosen so that they map directly to a 'verbosity level'.
    """
    ERROR = 0
    WARNING = 1
    INFO = 2
    DEBUG = 3

def comment_extract(tu):

    # FIXME: How to handle top level comments above a cursor that it does *not*
    # describe? Parsing @file or @doc at this stage would not be a clean design.
    # One idea is to use '/***' to denote them, but that might throw off editor
    # highlighting. The workaround is to follow the top level comment with an
    # empty '/**/' comment that gets attached to the cursor.

    top_level_comments = []
    comments = {}
    cursor = None
    current_comment = None
    for token in tu.get_tokens(extent=tu.cursor.extent):
        # handle all comments we come across
        if token.kind == TokenKind.COMMENT:
            # if we already have a comment, it wasn't related to a cursor
            if current_comment and docstr.is_doc(current_comment.spelling):
                top_level_comments.append(current_comment)
            current_comment = token
            continue

        # cursors that are 1) never documented themselves, and 2) allowed
        # between comment and the actual cursor being documented
        if (token.cursor.kind == CursorKind.INVALID_FILE or
            token.cursor.kind == CursorKind.TYPE_REF or
            token.cursor.kind == CursorKind.PREPROCESSING_DIRECTIVE or
            token.cursor.kind == CursorKind.MACRO_INSTANTIATION):
            continue

        if cursor is not None and token.cursor == cursor:
            continue

        cursor = token.cursor

        # Note: current_comment may be None
        if current_comment != None and docstr.is_doc(current_comment.spelling):
            comments[cursor.hash] = current_comment
        current_comment = None

    # comment at the end of file
    if current_comment and docstr.is_doc(current_comment.spelling):
        top_level_comments.append(current_comment)

    return top_level_comments, comments

def _result(comment, cursor=None, fmt=docstr.Type.TEXT, nest=0,
            name=None, ttype=None, args=None, compat=None):

    # FIXME: docstr.generate changes the number of lines in output. This impacts
    # the error reporting via meta['line']. Adjust meta to take this into
    # account.

    doc = docstr.generate(text=comment.spelling, fmt=fmt,
                          name=name, ttype=ttype, args=args, transform=compat)

    doc = docstr.nest(doc, nest)

    meta = {'line': comment.extent.start.line}
    if cursor:
        meta['cursor.kind']        = cursor.kind,
        meta['cursor.displayname'] = cursor.displayname,
        meta['cursor.spelling']    = cursor.spelling

    return [(doc, meta)]

# Return None for simple macros, a potentially empty list of arguments for
# function-like macros
def _get_macro_args(cursor):
    if cursor.kind != CursorKind.MACRO_DEFINITION:
        return None

    # Use the first two tokens to make sure this starts with 'IDENTIFIER('
    x = [token for token in itertools.islice(cursor.get_tokens(), 2)]
    if (len(x) != 2 or x[0].spelling != cursor.spelling or
        x[1].spelling != '(' or x[0].extent.end != x[1].extent.start):
        return None

    # Naïve parsing of macro arguments
    # FIXME: This doesn't handle GCC named vararg extension FOO(vararg...)
    args = []
    for token in itertools.islice(cursor.get_tokens(), 2, None):
        if token.spelling == ')':
            return args
        elif token.spelling == ',':
            continue
        elif token.kind == TokenKind.IDENTIFIER:
            args.append(token.spelling)
        elif token.spelling == '...':
            args.append(token.spelling)
        else:
            break

    return None

def _recursive_parse(comments, cursor, nest, compat):
    comment = comments[cursor.hash]
    name = cursor.spelling
    ttype = cursor.type.spelling

    if cursor.kind == CursorKind.MACRO_DEFINITION:
        # FIXME: check args against comment
        args = _get_macro_args(cursor)
        fmt = docstr.Type.MACRO if args is None else docstr.Type.MACRO_FUNC

        return _result(comment, cursor=cursor, fmt=fmt,
                       nest=nest, name=name, args=args, compat=compat)

    elif cursor.kind == CursorKind.VAR_DECL:
        fmt = docstr.Type.VAR

        return _result(comment, cursor=cursor, fmt=fmt,
                       nest=nest, name=name, ttype=ttype, compat=compat)

    elif cursor.kind == CursorKind.TYPEDEF_DECL:
        # FIXME: function pointers typedefs.
        fmt = docstr.Type.TYPE

        return _result(comment, cursor=cursor, fmt=fmt,
                       nest=nest, name=ttype, compat=compat)

    elif cursor.kind in [CursorKind.STRUCT_DECL, CursorKind.UNION_DECL,
                         CursorKind.ENUM_DECL]:

        # FIXME:
        # Handle cases where variables are instantiated on type declaration,
        # including anonymous cases. Idea is that if there is a variable
        # instantiation, the documentation should be applied to the variable if
        # the structure is anonymous or to the type otherwise.
        #
        # Due to the new recursiveness of the parser, fixing this here, _should_
        # handle all cases (struct, union, enum).

        # FIXME: Handle anonymous enumerators.

        fmt = docstr.Type.TYPE
        result = _result(comment, cursor=cursor, fmt=fmt,
                         nest=nest, name=ttype, compat=compat)

        nest += 1
        for c in cursor.get_children():
            if c.hash in comments:
                result.extend(_recursive_parse(comments, c, nest, compat))

        return result

    elif cursor.kind == CursorKind.ENUM_CONSTANT_DECL:
        fmt = docstr.Type.ENUM_VAL

        return _result(comment, cursor=cursor, fmt=fmt,
                       nest=nest, name=name, compat=compat)

    elif cursor.kind == CursorKind.FIELD_DECL:
        fmt = docstr.Type.MEMBER

        return _result(comment, cursor=cursor, fmt=fmt,
                       nest=nest, name=name, ttype=ttype, compat=compat)

    elif cursor.kind == CursorKind.FUNCTION_DECL:
        # FIXME: check args against comment
        # FIXME: children may contain extra stuff if the return type is a
        # typedef, for example
        args = []

        # Only fully prototyped functions will have argument lists to process.
        if cursor.type.kind == TypeKind.FUNCTIONPROTO:
            for c in cursor.get_children():
                if c.kind == CursorKind.PARM_DECL:
                    args.append('{ttype} {arg}'.format(ttype=c.type.spelling,
                                                    arg=c.spelling))

            if cursor.type.is_function_variadic():
                args.append('...')

        fmt = docstr.Type.FUNC
        ttype = cursor.result_type.spelling

        return _result(comment, cursor=cursor, fmt=fmt, nest=nest,
                       name=name, ttype=ttype, args=args, compat=compat)

    # FIXME: If we reach here, nothing matched. This is a warning or even error
    # and it should be logged, but it should also return an empty list so that
    # it doesn't break. I.e. the parser needs to pass warnings and errors to the
    # Sphinx extension instead of polluting the generated output.
    fmt = docstr.Type.TEXT
    text = 'warning: unhandled cursor {kind} {name}\n'.format(
        kind=str(cursor.kind),
        name=cursor.spelling)

    doc = docstr.generate(text=text, fmt=fmt)

    meta = {
        'line':               comment.extent.start.line,
        'cursor.kind':        cursor.kind,
        'cursor.displayname': cursor.displayname,
        'cursor.spelling':    cursor.spelling
    }

    return [(doc, meta)]

def clang_diagnostics(errors, diagnostics):
    sev = {0: ErrorLevel.DEBUG,
           1: ErrorLevel.DEBUG,
           2: ErrorLevel.WARNING,
           3: ErrorLevel.ERROR,
           4: ErrorLevel.ERROR}

    for diag in diagnostics:
        errors.extend([(sev[diag.severity], diag.location.file.name,
                        diag.location.line, diag.spelling)])

# return a list of (comment, metadata) tuples
# options - dictionary with directive options
def parse(filename, **options):

    errors = []
    args = options.get('clang')
    if args is not None:
        args = [s.strip() for s in args.split(',') if len(s.strip()) > 0]
        if len(args) == 0:
            args = None

    index = Index.create()

    tu = index.parse(filename, args=args, options=
                     TranslationUnit.PARSE_DETAILED_PROCESSING_RECORD |
                     TranslationUnit.PARSE_SKIP_FUNCTION_BODIES)

    clang_diagnostics(errors, tu.diagnostics)

    top_level_comments, comments = comment_extract(tu)

    result = []
    compat = lambda x: doccompat.convert(x, options.get('compat'))

    for comment in top_level_comments:
        result.extend(_result(comment, compat=compat))

    for cursor in tu.cursor.get_children():
        if cursor.hash in comments:
            result.extend(_recursive_parse(comments, cursor, 0, compat))

    # Sort all elements by order of appearance.
    result.sort(key=lambda r: r[1]['line'])

    return result, errors