codeql/python/ql/src/Expressions/HashedButNoHash.ql

/**
 * @name Unhashable object hashed
 * @description Hashing an object which is not hashable will result in a TypeError at runtime.
 * @kind problem
 * @tags reliability
 *       correctness
 * @problem.severity error
 * @sub-severity low
 * @precision very-high
 * @id py/hash-unhashable-value
 */

import python

/*
 * This assumes that any indexing operation where the value is not a sequence or numpy array involves hashing.
 * For sequences, the index must be an int, which are hashable, so we don't need to treat them specially.
 * For numpy arrays, the index may be a list, which are not hashable and needs to be treated specially.
 */

predicate numpy_array_type(ClassValue na) {
  exists(ModuleValue np | np.getName() = "numpy" or np.getName() = "numpy.core" |
    na.getASuperType() = np.attr("ndarray")
  )
}

predicate has_custom_getitem(Value v) {
  v.getClass().lookup("__getitem__") instanceof PythonFunctionValue
  or
  numpy_array_type(v.getClass())
}

predicate explicitly_hashed(ControlFlowNode f) {
  exists(CallNode c, GlobalVariable hash |
    c.getArg(0) = f and c.getFunction().(NameNode).uses(hash) and hash.getId() = "hash"
  )
}

predicate unhashable_subscript(ControlFlowNode f, ClassValue c, ControlFlowNode origin) {
  is_unhashable(f, c, origin) and
  exists(SubscriptNode sub | sub.getIndex() = f |
    exists(Value custom_getitem |
      sub.getObject().pointsTo(custom_getitem) and
      not has_custom_getitem(custom_getitem)
    )
  )
}

predicate is_unhashable(ControlFlowNode f, ClassValue cls, ControlFlowNode origin) {
  exists(Value v | f.pointsTo(v, origin) and v.getClass() = cls |
    not cls.hasAttribute("__hash__") and not cls.failedInference(_) and cls.isNewStyle()
    or
    cls.lookup("__hash__") = Value::named("None")
  )
}

/**
 * Holds if `f` is inside a `try` that catches `TypeError`. For example:
 *
 *    try:
 *       ... f ...
 *    except TypeError:
 *       ...
 *
 * This predicate is used to eliminate false positive results. If `hash`
 * is called on an unhashable object then a `TypeError` will be thrown.
 * But this is not a bug if the code catches the `TypeError` and handles
 * it.
 */
predicate typeerror_is_caught(ControlFlowNode f) {
  exists(Try try |
    try.getBody().contains(f.getNode()) and
    try.getAHandler().getType().pointsTo(ClassValue::typeError())
  )
}

from ControlFlowNode f, ClassValue c, ControlFlowNode origin
where
  not typeerror_is_caught(f) and
  (
    explicitly_hashed(f) and is_unhashable(f, c, origin)
    or
    unhashable_subscript(f, c, origin)
  )
select f.getNode(), "This $@ of $@ is unhashable.", origin, "instance", c, c.getQualifiedName()