third_party.pylibs.pylint.src/pylint/checkers/strings.py

# -*- coding: utf-8 -*-
# Copyright (c) 2009 Charles Hebert <charles.hebert@logilab.fr>
# Copyright (c) 2010-2014 LOGILAB S.A. (Paris, FRANCE) <contact@logilab.fr>
# Copyright (c) 2010 Daniel Harding <dharding@gmail.com>
# Copyright (c) 2012-2014 Google, Inc.
# Copyright (c) 2013-2018 Claudiu Popa <pcmanticore@gmail.com>
# Copyright (c) 2014 Brett Cannon <brett@python.org>
# Copyright (c) 2014 Arun Persaud <arun@nubati.net>
# Copyright (c) 2015 Rene Zhang <rz99@cornell.edu>
# Copyright (c) 2015 Ionel Cristian Maries <contact@ionelmc.ro>
# Copyright (c) 2016, 2018 Jakub Wilk <jwilk@jwilk.net>
# Copyright (c) 2016 Peter Dawyndt <Peter.Dawyndt@UGent.be>
# Copyright (c) 2017 Łukasz Rogalski <rogalski.91@gmail.com>
# Copyright (c) 2017 Ville Skyttä <ville.skytta@iki.fi>
# Copyright (c) 2018 Nick Drozd <nicholasdrozd@gmail.com>
# Copyright (c) 2018 Anthony Sottile <asottile@umich.edu>


# Licensed under the GPL: https://www.gnu.org/licenses/old-licenses/gpl-2.0.html
# For details: https://github.com/PyCQA/pylint/blob/master/COPYING

"""Checker for string formatting operations.
"""

import builtins
import numbers
import tokenize
from collections import Counter

import astroid
from astroid.arguments import CallSite
from astroid.node_classes import Const

from pylint.checkers import BaseChecker, BaseTokenChecker, utils
from pylint.checkers.utils import check_messages
from pylint.interfaces import IAstroidChecker, IRawChecker, ITokenChecker

_AST_NODE_STR_TYPES = ("__builtin__.unicode", "__builtin__.str", "builtins.str")

MSGS = {
    "E1300": (
        "Unsupported format character %r (%#02x) at index %d",
        "bad-format-character",
        "Used when an unsupported format character is used in a format string.",
    ),
    "E1301": (
        "Format string ends in middle of conversion specifier",
        "truncated-format-string",
        "Used when a format string terminates before the end of a "
        "conversion specifier.",
    ),
    "E1302": (
        "Mixing named and unnamed conversion specifiers in format string",
        "mixed-format-string",
        "Used when a format string contains both named (e.g. '%(foo)d') "
        "and unnamed (e.g. '%d') conversion specifiers.  This is also "
        "used when a named conversion specifier contains * for the "
        "minimum field width and/or precision.",
    ),
    "E1303": (
        "Expected mapping for format string, not %s",
        "format-needs-mapping",
        "Used when a format string that uses named conversion specifiers "
        "is used with an argument that is not a mapping.",
    ),
    "W1300": (
        "Format string dictionary key should be a string, not %s",
        "bad-format-string-key",
        "Used when a format string that uses named conversion specifiers "
        "is used with a dictionary whose keys are not all strings.",
    ),
    "W1301": (
        "Unused key %r in format string dictionary",
        "unused-format-string-key",
        "Used when a format string that uses named conversion specifiers "
        "is used with a dictionary that contains keys not required by the "
        "format string.",
    ),
    "E1304": (
        "Missing key %r in format string dictionary",
        "missing-format-string-key",
        "Used when a format string that uses named conversion specifiers "
        "is used with a dictionary that doesn't contain all the keys "
        "required by the format string.",
    ),
    "E1305": (
        "Too many arguments for format string",
        "too-many-format-args",
        "Used when a format string that uses unnamed conversion "
        "specifiers is given too many arguments.",
    ),
    "E1306": (
        "Not enough arguments for format string",
        "too-few-format-args",
        "Used when a format string that uses unnamed conversion "
        "specifiers is given too few arguments",
    ),
    "E1307": (
        "Argument %r does not match format type %r",
        "bad-string-format-type",
        "Used when a type required by format string "
        "is not suitable for actual argument type",
    ),
    "E1310": (
        "Suspicious argument in %s.%s call",
        "bad-str-strip-call",
        "The argument to a str.{l,r,}strip call contains a duplicate character, ",
    ),
    "W1302": (
        "Invalid format string",
        "bad-format-string",
        "Used when a PEP 3101 format string is invalid.",
    ),
    "W1303": (
        "Missing keyword argument %r for format string",
        "missing-format-argument-key",
        "Used when a PEP 3101 format string that uses named fields "
        "doesn't receive one or more required keywords.",
    ),
    "W1304": (
        "Unused format argument %r",
        "unused-format-string-argument",
        "Used when a PEP 3101 format string that uses named "
        "fields is used with an argument that "
        "is not required by the format string.",
    ),
    "W1305": (
        "Format string contains both automatic field numbering "
        "and manual field specification",
        "format-combined-specification",
        "Used when a PEP 3101 format string contains both automatic "
        "field numbering (e.g. '{}') and manual field "
        "specification (e.g. '{0}').",
    ),
    "W1306": (
        "Missing format attribute %r in format specifier %r",
        "missing-format-attribute",
        "Used when a PEP 3101 format string uses an "
        "attribute specifier ({0.length}), but the argument "
        "passed for formatting doesn't have that attribute.",
    ),
    "W1307": (
        "Using invalid lookup key %r in format specifier %r",
        "invalid-format-index",
        "Used when a PEP 3101 format string uses a lookup specifier "
        "({a[1]}), but the argument passed for formatting "
        "doesn't contain or doesn't have that key as an attribute.",
    ),
    "W1308": (
        "Duplicate string formatting argument %r, consider passing as named argument",
        "duplicate-string-formatting-argument",
        "Used when we detect that a string formatting is "
        "repeating an argument instead of using named string arguments",
    ),
}

OTHER_NODES = (
    astroid.Const,
    astroid.List,
    astroid.Lambda,
    astroid.FunctionDef,
    astroid.ListComp,
    astroid.SetComp,
    astroid.GeneratorExp,
)

BUILTINS_STR = builtins.__name__ + ".str"
BUILTINS_FLOAT = builtins.__name__ + ".float"
BUILTINS_INT = builtins.__name__ + ".int"


def get_access_path(key, parts):
    """ Given a list of format specifiers, returns
    the final access path (e.g. a.b.c[0][1]).
    """
    path = []
    for is_attribute, specifier in parts:
        if is_attribute:
            path.append(".{}".format(specifier))
        else:
            path.append("[{!r}]".format(specifier))
    return str(key) + "".join(path)


def arg_matches_format_type(arg_type, format_type):
    if format_type in "sr":
        # All types can be printed with %s and %r
        return True
    if isinstance(arg_type, astroid.Instance):
        arg_type = arg_type.pytype()
        if arg_type == BUILTINS_STR:
            return format_type == "c"
        if arg_type == BUILTINS_FLOAT:
            return format_type in "deEfFgGn%"
        if arg_type == BUILTINS_INT:
            # Integers allow all types
            return True
        return False
    return True


class StringFormatChecker(BaseChecker):
    """Checks string formatting operations to ensure that the format string
    is valid and the arguments match the format string.
    """

    __implements__ = (IAstroidChecker,)
    name = "string"
    msgs = MSGS

    # pylint: disable=too-many-branches
    @check_messages(*MSGS)
    def visit_binop(self, node):
        if node.op != "%":
            return
        left = node.left
        args = node.right

        if not (isinstance(left, astroid.Const) and isinstance(left.value, str)):
            return
        format_string = left.value
        try:
            required_keys, required_num_args, required_key_types, required_arg_types = utils.parse_format_string(
                format_string
            )
        except utils.UnsupportedFormatCharacter as exc:
            formatted = format_string[exc.index]
            self.add_message(
                "bad-format-character",
                node=node,
                args=(formatted, ord(formatted), exc.index),
            )
            return
        except utils.IncompleteFormatString:
            self.add_message("truncated-format-string", node=node)
            return
        if required_keys and required_num_args:
            # The format string uses both named and unnamed format
            # specifiers.
            self.add_message("mixed-format-string", node=node)
        elif required_keys:
            # The format string uses only named format specifiers.
            # Check that the RHS of the % operator is a mapping object
            # that contains precisely the set of keys required by the
            # format string.
            if isinstance(args, astroid.Dict):
                keys = set()
                unknown_keys = False
                for k, _ in args.items:
                    if isinstance(k, astroid.Const):
                        key = k.value
                        if isinstance(key, str):
                            keys.add(key)
                        else:
                            self.add_message(
                                "bad-format-string-key", node=node, args=key
                            )
                    else:
                        # One of the keys was something other than a
                        # constant.  Since we can't tell what it is,
                        # suppress checks for missing keys in the
                        # dictionary.
                        unknown_keys = True
                if not unknown_keys:
                    for key in required_keys:
                        if key not in keys:
                            self.add_message(
                                "missing-format-string-key", node=node, args=key
                            )
                for key in keys:
                    if key not in required_keys:
                        self.add_message(
                            "unused-format-string-key", node=node, args=key
                        )
                for key, arg in args.items:
                    if not isinstance(key, astroid.Const):
                        continue
                    format_type = required_key_types.get(key.value, None)
                    arg_type = utils.safe_infer(arg)
                    if (
                        format_type is not None
                        and arg_type not in (None, astroid.Uninferable)
                        and not arg_matches_format_type(arg_type, format_type)
                    ):
                        self.add_message(
                            "bad-string-format-type",
                            node=node,
                            args=(arg_type.pytype(), format_type),
                        )
            elif isinstance(args, (OTHER_NODES, astroid.Tuple)):
                type_name = type(args).__name__
                self.add_message("format-needs-mapping", node=node, args=type_name)
            # else:
            # The RHS of the format specifier is a name or
            # expression.  It may be a mapping object, so
            # there's nothing we can check.
        else:
            # The format string uses only unnamed format specifiers.
            # Check that the number of arguments passed to the RHS of
            # the % operator matches the number required by the format
            # string.
            args_elts = ()
            if isinstance(args, astroid.Tuple):
                rhs_tuple = utils.safe_infer(args)
                num_args = None
                if hasattr(rhs_tuple, "elts"):
                    args_elts = rhs_tuple.elts
                    num_args = len(args_elts)
            elif isinstance(args, (OTHER_NODES, (astroid.Dict, astroid.DictComp))):
                args_elts = [args]
                num_args = 1
            else:
                # The RHS of the format specifier is a name or
                # expression.  It could be a tuple of unknown size, so
                # there's nothing we can check.
                num_args = None
            if num_args is not None:
                if num_args > required_num_args:
                    self.add_message("too-many-format-args", node=node)
                elif num_args < required_num_args:
                    self.add_message("too-few-format-args", node=node)
                for arg, format_type in zip(args_elts, required_arg_types):
                    if not arg:
                        continue
                    arg_type = utils.safe_infer(arg)
                    if arg_type not in (
                        None,
                        astroid.Uninferable,
                    ) and not arg_matches_format_type(arg_type, format_type):
                        self.add_message(
                            "bad-string-format-type",
                            node=node,
                            args=(arg_type.pytype(), format_type),
                        )

    @check_messages(*MSGS)
    def visit_call(self, node):
        func = utils.safe_infer(node.func)
        if (
            isinstance(func, astroid.BoundMethod)
            and isinstance(func.bound, astroid.Instance)
            and func.bound.name in ("str", "unicode", "bytes")
        ):
            if func.name in ("strip", "lstrip", "rstrip") and node.args:
                arg = utils.safe_infer(node.args[0])
                if not isinstance(arg, astroid.Const) or not isinstance(arg.value, str):
                    return
                if len(arg.value) != len(set(arg.value)):
                    self.add_message(
                        "bad-str-strip-call",
                        node=node,
                        args=(func.bound.name, func.name),
                    )
            elif func.name == "format":
                self._check_new_format(node, func)

    def _detect_vacuous_formatting(self, node, positional_arguments):
        counter = Counter(
            arg.name for arg in positional_arguments if isinstance(arg, astroid.Name)
        )
        for name, count in counter.items():
            if count == 1:
                continue
            self.add_message(
                "duplicate-string-formatting-argument", node=node, args=(name,)
            )

    def _check_new_format(self, node, func):
        """Check the new string formatting. """
        # Skip ormat nodes which don't have an explicit string on the
        # left side of the format operation.
        # We do this because our inference engine can't properly handle
        # redefinitions of the original string.
        # Note that there may not be any left side at all, if the format method
        # has been assigned to another variable. See issue 351. For example:
        #
        #    fmt = 'some string {}'.format
        #    fmt('arg')
        if isinstance(node.func, astroid.Attribute) and not isinstance(
            node.func.expr, astroid.Const
        ):
            return
        if node.starargs or node.kwargs:
            return
        try:
            strnode = next(func.bound.infer())
        except astroid.InferenceError:
            return
        if not (isinstance(strnode, astroid.Const) and isinstance(strnode.value, str)):
            return
        try:
            call_site = CallSite.from_call(node)
        except astroid.InferenceError:
            return

        try:
            fields, num_args, manual_pos = utils.parse_format_method_string(
                strnode.value
            )
        except utils.IncompleteFormatString:
            self.add_message("bad-format-string", node=node)
            return

        positional_arguments = call_site.positional_arguments
        named_arguments = call_site.keyword_arguments
        named_fields = {field[0] for field in fields if isinstance(field[0], str)}
        if num_args and manual_pos:
            self.add_message("format-combined-specification", node=node)
            return

        check_args = False
        # Consider "{[0]} {[1]}" as num_args.
        num_args += sum(1 for field in named_fields if field == "")
        if named_fields:
            for field in named_fields:
                if field and field not in named_arguments:
                    self.add_message(
                        "missing-format-argument-key", node=node, args=(field,)
                    )
            for field in named_arguments:
                if field not in named_fields:
                    self.add_message(
                        "unused-format-string-argument", node=node, args=(field,)
                    )
            # num_args can be 0 if manual_pos is not.
            num_args = num_args or manual_pos
            if positional_arguments or num_args:
                empty = any(True for field in named_fields if field == "")
                if named_arguments or empty:
                    # Verify the required number of positional arguments
                    # only if the .format got at least one keyword argument.
                    # This means that the format strings accepts both
                    # positional and named fields and we should warn
                    # when one of the them is missing or is extra.
                    check_args = True
        else:
            check_args = True
        if check_args:
            # num_args can be 0 if manual_pos is not.
            num_args = num_args or manual_pos
            if len(positional_arguments) > num_args:
                self.add_message("too-many-format-args", node=node)
            elif len(positional_arguments) < num_args:
                self.add_message("too-few-format-args", node=node)

        self._detect_vacuous_formatting(node, positional_arguments)
        self._check_new_format_specifiers(node, fields, named_arguments)

    def _check_new_format_specifiers(self, node, fields, named):
        """
        Check attribute and index access in the format
        string ("{0.a}" and "{0[a]}").
        """
        for key, specifiers in fields:
            # Obtain the argument. If it can't be obtained
            # or inferred, skip this check.
            if key == "":
                # {[0]} will have an unnamed argument, defaulting
                # to 0. It will not be present in `named`, so use the value
                # 0 for it.
                key = 0
            if isinstance(key, numbers.Number):
                try:
                    argname = utils.get_argument_from_call(node, key)
                except utils.NoSuchArgumentError:
                    continue
            else:
                if key not in named:
                    continue
                argname = named[key]
            if argname in (astroid.Uninferable, None):
                continue
            try:
                argument = next(argname.infer())
            except astroid.InferenceError:
                continue
            if not specifiers or argument is astroid.Uninferable:
                # No need to check this key if it doesn't
                # use attribute / item access
                continue
            if argument.parent and isinstance(argument.parent, astroid.Arguments):
                # Ignore any object coming from an argument,
                # because we can't infer its value properly.
                continue
            previous = argument
            parsed = []
            for is_attribute, specifier in specifiers:
                if previous is astroid.Uninferable:
                    break
                parsed.append((is_attribute, specifier))
                if is_attribute:
                    try:
                        previous = previous.getattr(specifier)[0]
                    except astroid.NotFoundError:
                        if (
                            hasattr(previous, "has_dynamic_getattr")
                            and previous.has_dynamic_getattr()
                        ):
                            # Don't warn if the object has a custom __getattr__
                            break
                        path = get_access_path(key, parsed)
                        self.add_message(
                            "missing-format-attribute",
                            args=(specifier, path),
                            node=node,
                        )
                        break
                else:
                    warn_error = False
                    if hasattr(previous, "getitem"):
                        try:
                            previous = previous.getitem(astroid.Const(specifier))
                        except (
                            astroid.AstroidIndexError,
                            astroid.AstroidTypeError,
                            astroid.AttributeInferenceError,
                        ):
                            warn_error = True
                        except astroid.InferenceError:
                            break
                        if previous is astroid.Uninferable:
                            break
                    else:
                        try:
                            # Lookup __getitem__ in the current node,
                            # but skip further checks, because we can't
                            # retrieve the looked object
                            previous.getattr("__getitem__")
                            break
                        except astroid.NotFoundError:
                            warn_error = True
                    if warn_error:
                        path = get_access_path(key, parsed)
                        self.add_message(
                            "invalid-format-index", args=(specifier, path), node=node
                        )
                        break

                try:
                    previous = next(previous.infer())
                except astroid.InferenceError:
                    # can't check further if we can't infer it
                    break


class StringConstantChecker(BaseTokenChecker):
    """Check string literals"""

    __implements__ = (IAstroidChecker, ITokenChecker, IRawChecker)
    name = "string"
    msgs = {
        "W1401": (
            "Anomalous backslash in string: '%s'. "
            "String constant might be missing an r prefix.",
            "anomalous-backslash-in-string",
            "Used when a backslash is in a literal string but not as an escape.",
        ),
        "W1402": (
            "Anomalous Unicode escape in byte string: '%s'. "
            "String constant might be missing an r or u prefix.",
            "anomalous-unicode-escape-in-string",
            "Used when an escape like \\u is encountered in a byte "
            "string where it has no effect.",
        ),
        "W1403": (
            "Implicit string concatenation found in %s",
            "implicit-str-concat-in-sequence",
            "String literals are implicitly concatenated in a "
            "literal iterable definition : "
            "maybe a comma is missing ?",
        ),
    }
    options = (
        (
            "check-str-concat-over-line-jumps",
            {
                "default": False,
                "type": "yn",
                "metavar": "<y_or_n>",
                "help": "This flag controls whether the "
                "implicit-str-concat-in-sequence should generate a warning "
                "on implicit string concatenation in sequences defined over "
                "several lines.",
            },
        ),
    )

    # Characters that have a special meaning after a backslash in either
    # Unicode or byte strings.
    ESCAPE_CHARACTERS = "abfnrtvx\n\r\t\\'\"01234567"

    # Characters that have a special meaning after a backslash but only in
    # Unicode strings.
    UNICODE_ESCAPE_CHARACTERS = "uUN"

    def __init__(self, *args, **kwargs):
        super(StringConstantChecker, self).__init__(*args, **kwargs)
        self.string_tokens = {}  # token position -> (token value, next token)

    def process_module(self, module):
        self._unicode_literals = "unicode_literals" in module.future_imports

    def process_tokens(self, tokens):
        encoding = "ascii"
        for i, (tok_type, token, start, _, line) in enumerate(tokens):
            if tok_type == tokenize.ENCODING:
                # this is always the first token processed
                encoding = token
            elif tok_type == tokenize.STRING:
                # 'token' is the whole un-parsed token; we can look at the start
                # of it to see whether it's a raw or unicode string etc.
                self.process_string_token(token, start[0])
                # We figure the next token, ignoring comments & newlines:
                j = i + 1
                while j < len(tokens) and tokens[j].type in (
                    tokenize.NEWLINE,
                    tokenize.NL,
                    tokenize.COMMENT,
                ):
                    j += 1
                next_token = tokens[j] if j < len(tokens) else None
                if encoding != "ascii":
                    # We convert `tokenize` character count into a byte count,
                    # to match with astroid `.col_offset`
                    start = (start[0], len(line[: start[1]].encode(encoding)))
                self.string_tokens[start] = (str_eval(token), next_token)

    @check_messages(*(msgs.keys()))
    def visit_list(self, node):
        self.check_for_concatenated_strings(node, "list")

    @check_messages(*(msgs.keys()))
    def visit_set(self, node):
        self.check_for_concatenated_strings(node, "set")

    @check_messages(*(msgs.keys()))
    def visit_tuple(self, node):
        self.check_for_concatenated_strings(node, "tuple")

    def check_for_concatenated_strings(self, iterable_node, iterable_type):
        for elt in iterable_node.elts:
            if isinstance(elt, Const) and elt.pytype() in _AST_NODE_STR_TYPES:
                if elt.col_offset < 0:
                    # This can happen in case of escaped newlines
                    continue
                if (elt.lineno, elt.col_offset) not in self.string_tokens:
                    # This may happen with Latin1 encoding
                    # cf. https://github.com/PyCQA/pylint/issues/2610
                    continue
                matching_token, next_token = self.string_tokens[
                    (elt.lineno, elt.col_offset)
                ]
                # We detect string concatenation: the AST Const is the
                # combination of 2 string tokens
                if matching_token != elt.value and next_token is not None:
                    if next_token.type == tokenize.STRING and (
                        next_token.start[0] == elt.lineno
                        or self.config.check_str_concat_over_line_jumps
                    ):
                        self.add_message(
                            "implicit-str-concat-in-sequence",
                            line=elt.lineno,
                            args=(iterable_type,),
                        )

    def process_string_token(self, token, start_row):
        quote_char = None
        index = None
        for index, char in enumerate(token):
            if char in "'\"":
                quote_char = char
                break
        if quote_char is None:
            return

        prefix = token[:index].lower()  # markers like u, b, r.
        after_prefix = token[index:]
        if after_prefix[:3] == after_prefix[-3:] == 3 * quote_char:
            string_body = after_prefix[3:-3]
        else:
            string_body = after_prefix[1:-1]  # Chop off quotes
        # No special checks on raw strings at the moment.
        if "r" not in prefix:
            self.process_non_raw_string_token(prefix, string_body, start_row)

    def process_non_raw_string_token(self, prefix, string_body, start_row):
        """check for bad escapes in a non-raw string.

        prefix: lowercase string of eg 'ur' string prefix markers.
        string_body: the un-parsed body of the string, not including the quote
        marks.
        start_row: integer line number in the source.
        """
        # Walk through the string; if we see a backslash then escape the next
        # character, and skip over it.  If we see a non-escaped character,
        # alert, and continue.
        #
        # Accept a backslash when it escapes a backslash, or a quote, or
        # end-of-line, or one of the letters that introduce a special escape
        # sequence <http://docs.python.org/reference/lexical_analysis.html>
        #
        index = 0
        while True:
            index = string_body.find("\\", index)
            if index == -1:
                break
            # There must be a next character; having a backslash at the end
            # of the string would be a SyntaxError.
            next_char = string_body[index + 1]
            match = string_body[index : index + 2]
            if next_char in self.UNICODE_ESCAPE_CHARACTERS:
                if "u" in prefix:
                    pass
                elif "b" not in prefix:
                    pass  # unicode by default
                else:
                    self.add_message(
                        "anomalous-unicode-escape-in-string",
                        line=start_row,
                        args=(match,),
                        col_offset=index,
                    )
            elif next_char not in self.ESCAPE_CHARACTERS:
                self.add_message(
                    "anomalous-backslash-in-string",
                    line=start_row,
                    args=(match,),
                    col_offset=index,
                )
            # Whether it was a valid escape or not, backslash followed by
            # another character can always be consumed whole: the second
            # character can never be the start of a new backslash escape.
            index += 2


def register(linter):
    """required method to auto register this checker """
    linter.register_checker(StringFormatChecker(linter))
    linter.register_checker(StringConstantChecker(linter))


def str_eval(token):
    """
    Mostly replicate `ast.literal_eval(token)` manually to avoid any performance hit.
    This supports f-strings, contrary to `ast.literal_eval`.
    We have to support all string literal notations:
    https://docs.python.org/3/reference/lexical_analysis.html#string-and-bytes-literals
    """
    if token[0:2].lower() in ("fr", "rf"):
        token = token[2:]
    elif token[0].lower() in ("r", "u", "f"):
        token = token[1:]
    if token[0:3] in ('"""', "'''"):
        return token[3:-3]
    return token[1:-1]