Source code for interact.parse

# Copyright (c) 2013 Galah Group LLC
# Copyright (c) 2013 Other contributers as noted in the CONTRIBUTERS file
#
# This file is part of galah-interact-python.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
#
# You may obtain a copy of the License at
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
This module is useful when attempting to roughly parse students' code (ex:
trying to check that indentation was properly used). This module does not
attempt to, and never will, try and fully parse C++. If such facilities are
added to Galah Interact they will probably be added as a seperate module that
provides a nice abstraction to Clang.

"""

[docs]class Block:
    """
    Represents a block of code.

    :ivar lines: A list of ``Line`` objects that make up this block.
    :ivar sub_blocks: A list of ``Block`` objects that are children of
            this block.

    """

    def __init__(self, lines, sub_blocks = None):
        if sub_blocks is None:
            sub_blocks = []

        self.lines = lines
        self.sub_blocks = sub_blocks

    def _to_str_list(self, indent_level = 0):
        result = [("\t" * indent_level) + repr(i) for i in self.lines]
        for i in self.sub_blocks:
            result += i._to_str_list(indent_level = indent_level + 1)

        return result

    def __str__(self):
        return "\n".join(self._to_str_list())

[docs]class Line:
    """
    Represents a line of code.

    :ivar code: The contents of the line.
    :ivar line_number: The line number.

    """

    def __init__(self, line_number, code):
        self.code = code
        self.line_number = line_number

    def __str__(self):
        return self.code

    def __repr__(self):
        return "Line(%d, %s)" % (self.line_number, repr(self.code))

[docs]    def indent_level(self):
        """
        Determines the indentation level of the current line.

        :returns: The sum of the number of tabs and the number of spaces at the
                  start of the line. Iff the line is blank (not including
                  whitespace), ``None`` is returned.

        """

        # Iterates through the line character by character until a
        # non-whitespace character is hit.
        for i, c in enumerate(self.code):
            if c not in [" ", "\t"]:
                break
        else:
            # If we never hit a non-whitespace character...
            return None

        # i is the position of the first non-whitespace character, and because i
        # is zero-indexed, i is also the number of whitespace characters we saw.
        return i

    @classmethod
[docs]    def make_lines(cls, lines, start = 1):
        """
        Creates a list of Line objects from a list of strings representing
        lines in a file.

        :param lines: A list of strings where each string is a line in a file.
        :param start: The line number of the first line in ``lines``.
        :returns: A list of line objects.

        >>> Line.make_lines(["int main() {", "   return 0;", "}"], 1)
        [
            Line(1, "int main() {"),
            Line(2, "   return 0;"),
            Line(3, "}")
        ]

        """

        return (cls(n, line) for n, line in enumerate(lines))

    @staticmethod
[docs]    def lines_to_str_list(lines):
        """
        Creates a list of strings from a list of ``Line`` objects.

        :param lines: A list of ``Line`` objects.
        :returns: A list of strings.

        >>> my_lines = [
            Line(1, "int main() {"),
            Line(2, "   return 0;"),
            Line(3, "}")
        ]
        >>> Line.lines_to_str_list(my_lines)
        [
            "int main() {",
            "    return 0;",
            "}"
        ]

        """
        return [i.code for i in lines]

    @staticmethod
[docs]    def lines_to_str(lines):
        """
        Creates a single string from a list of ``Line`` objects.

        :param lines: A list of ``Line`` objects.
        :returns: A single string.

        >>> my_lines = [
            Line(1, "int main() {"),
            Line(2, "   return 0;"),
            Line(3, "}")
        ]
        >>> Line.lines_to_str(my_lines)
        "int main() {\\n    return 0\\n}\\n"

        """

        return "\n".join(Line.lines_to_str_list(lines))

    def __eq__(self, other):
        """
        Determines if two lines are equal.

        :returns: ``True`` if the two lines have the same code in them and the
                  same line number, ``False`` otherwise.

        >>> Line("foo()", 2) == Line("foo()", 2)
        True
        >>> Line("foo()", 2) == Line("foo()", 3)
        False
        >>> Line("foo()", 2) == Line("    foo()", 2)
        False

        """

        return self.code == other.code and self.line_number == other.line_number

[docs]def grab_blocks(lines):
    """
    Finds all blocks created using curly braces (does not handle two line if
    statements for example).

    :param lines: A list of ``Line`` objects.
    :returns: A single ``Block`` object which can be traversed like a tree.

    >>> my_lines = [
    ...     Line(0, "#include <iostream>"),
    ...     Line(1, ""),
    ...     Line(2, "using namespace std;"),
    ...     Line(3, ""),
    ...     Line(4, "int main() {"),
    ...     Line(5, '    cout << "Hello world" << endl;'),
    ...     Line(6, "    return 0"),
    ...     Line(7, "}")
    ... ]
    >>> grab_blocks(my_lines)
    Block(
        lines = [
            Line(0, "#include <iostream>"),
            Line(1, ""),
            Line(2, "using namespace std;"),
            Line(3, ""),
            Line(4, "int main() {"),
            Line(7, "}")
        ],
        sub_blocks = [
            Block(
                lines = [
                    Line(5, '    cout << "Hello world" << endl;'),
                    Line(6, "    return 0")
                ],
                sub_blocks = None
            )
        ]
    )

    *(Note that I formatted the above example specially, it won't actually
    print out so beautifully if you try it yourself, but the content will be
    the same)*

    """

    # The number of nested blocks the current line is in relative to our
    # starting point. We will only look at the indentation for lines with
    # in_block == 0, all other lines we will recursively defer.
    in_block = 0

    # These are all of the lines that are within the current block (and not
    # within any sub blocks).
    lines_to_check = []

    # Contains the actual result (the list of two-tuples).
    sub_blocks = []

    # When we encounter code that is in a sub block, we push it onto this list.
    # Then when we get out of that sub block, we recurse on these lines and then
    # empty the list.
    unhandled_chunk = []

    for line in lines:
        # Remove all quoted strings from the line, that way when we search for
        # a curly brace we know it's an actual curly brace.
        stripped_line = cleanse_quoted_strings(line.code)

        # Will be set to true if the the current line is in the current block.
        include_current_line = False

        # Figure out if we are ending a block here
        for char in stripped_line:
            # If at any point we reach 0, that means we want to include this
            # line in the current block. This may occur if you have something
            # like "} else {".
            if in_block == 0:
                include_current_line = True

            # See comment above the in_block initialization.
            if char == "{":
                in_block += 1
            elif char == "}":
                in_block -= 1

        # Check if we ended a block here.
        if in_block == 0:
            include_current_line = True

        if include_current_line:
            # If we were just looking at a chunk of code in a sub block,
            # recurse properly.
            if unhandled_chunk:
                unhandled_block = grab_blocks(unhandled_chunk)
                if unhandled_block:
                    sub_blocks.append(unhandled_block)

                unhandled_chunk = []

            lines_to_check.append(line)
        else:
            unhandled_chunk.append(line)

    if not lines_to_check and not sub_blocks:
        return None
    else:
        return Block(lines_to_check, sub_blocks)

[docs]def cleanse_quoted_strings(line):
    """
    Removes all quoted strings from a line. Single quotes are treated the same
    as double quotes.

    Escaped quotes are handled. A forward slash is assumed to be the escape
    character. Escape sequences are not processed (meaning `\"` does not become
    `"`, it just remains as `\"`).

    :param line: A string to be cleansed.
    :returns: The line without any quoted strings.

    >>> cleanse_quoted_strings("I am 'John Sullivan', creator of worlds.")
    "I am , creator of worlds."
    >>> cleanse_quoted_strings(
    ...     'I am "John Sullivan \\"the Destroyer\\", McGee", fear me.'
    ... )
    "I am , fear me."

    This function is of particular use when trying to detect curly braces or
    other language constructs, and you don't want to be fooled by the symbols
    appearing in string literals.

    """

    # Returns ' if " is given, returns " if ' is given.
    inv_quote = lambda x: "'" if x == "\"" else "\""

    is_quote = lambda x: x in ["\"", "'"]

    # Will be a list of characters that we will join together to get the
    # resulting string sans quoted strings.
    unquoted_string = []

    in_quotes = None
    for i, char in enumerate(line):
        # Check to see if this character is escaped (this will occur if there is
        # an odd number of back slashes in front of it).
        num_slashes = 0
        for j in reversed(range(0, i)):
            if line[j] == "\\":
                num_slashes += 1
            else:
                break
        escaped = num_slashes % 2 == 1

        if char == in_quotes and not escaped:
            in_quotes = None
            continue
        elif is_quote(char) and in_quotes is None:
            in_quotes = char
            continue

        if in_quotes is None:
            unquoted_string.append(char)

    return "".join(unquoted_string)

#: Lines of code to ignore when looking for bad indentation. See
#: :func:`find_bad_indentation` for more information.
INDENT_EXCEPTED_LINES = ["public:", "private:", "protected:"]

[docs]def find_bad_indentation(block, minimum = None):
    """
    Detects blocks of code that are not indented more than their parent blocks.

    :param block: The top-level block of code. Sub-blocks will be recursively
                  checked.
    :param minimum: The minimum level of indentation required for the top-level
                    block. Mainly useful due to this function's recursive
                    nature.
    :returns: A list of ``Line`` objects where each ``Line`` had a problem with
              its indentation.

    .. note::

        Lines that match (after removing whitespace) lines in
        :data:`INDENT_EXCEPTED_LINES` will be ignored.

    >>> my_block = Block(
    ...     lines = [
    ...         Line(0,  "#include <iostream>"),
    ...         Line(1,  ""),
    ...         Line(2,  "using namespace std;"),
    ...         Line(3,  ""),
    ...         Line(4,  "int main() {"),
    ...         Line(15, "}")
    ...     ],
    ...     sub_blocks = [
    ...         Block(
    ...             lines = [
    ...                 Line(5,  '    cout << "{" << endl;'),
    ...                 Line(6,  "    if (true)"),
    ...                 Line(7,  "    {"),
    ...                 Line(9,  "    } else {"),
    ...                 Line(12, "    }"),
    ...                 Line(13, "        pinata"),
    ...                 Line(14, "    return 0")
    ...             ],
    ...             sub_blocks = [
    ...                 Block(
    ...                     lines = [
    ...                         Line(8, "        return false;")
    ...                     ]
    ...                 ),
    ...                 Block(
    ...                     lines = [
    ...                         Line(10, "        return true;"),
    ...                         Line(11, "oh noz")
    ...                     ]
    ...                 )
    ...             ]
    ...         )
    ...     ]
    ... )
    >>> find_bad_indentation(my_block)
    [Line(11, "oh noz")]

    """

    problems = []

    # Check that each line in the current block has an indentation level
    # strictly greater than the minimum.
    for i in block.lines:
        if i.code.strip() in INDENT_EXCEPTED_LINES:
            continue

        indent_level = i.indent_level()
        if minimum is not None and indent_level is not None and \
                indent_level <= minimum:
            problems.append(i)

    # Find the indent level of the least indented line in the current block
    levels = []
    for i in block.lines:
        if i.code.strip() in INDENT_EXCEPTED_LINES:
            continue

        level = i.indent_level()
        if level is not None:
            levels.append(level)
    new_minimum = min(levels)

    # Recurse into every sub block
    for i in block.sub_blocks:
        problems += find_bad_indentation(i, new_minimum)

    return problems