Compare commits

..

2 Commits

Author SHA1 Message Date
Taus
a7ba98cff7 Python: Eliminate some FPs
Two main variants: cases where `__eq__` is not defined (which inherit
`__hash__` automatically), and frozen dataclasses.
2026-04-09 21:35:05 +00:00
Taus
682178a370 Python: Port HashedButNoHash.ql
This one is a bit more involved. Of note is the fact that it at present
only uses local flow when determining the origin of some value (whereas
the points-to version used global flow). It may be desirable to rewrite
this query to use global data-flow, but this should be done with some
care (as using "all unhashable objects" as the set of sources is
somewhat iffy with respect to performance). For that reason, I'm
sticking to mostly local flow (except for well behaved things like types
and built-ins).
2026-04-08 12:18:51 +00:00
9 changed files with 192 additions and 78 deletions

View File

@@ -12,27 +12,19 @@
*/
import python
private import semmle.python.dataflow.new.internal.DataFlowDispatch
private import LegacyPointsTo
/**
* Gets the `i`th base class of `cls`, if it can be resolved to a user-defined class.
*/
Class getBaseType(Class cls, int i) {
cls.getBase(i) = classTracker(result).asExpr() and
result != cls
ClassObject left_base(ClassObject type, ClassObject base) {
exists(int i | i > 0 and type.getBaseType(i) = base and result = type.getBaseType(i - 1))
}
Class left_base(Class type, Class base) {
exists(int i | i > 0 and getBaseType(type, i) = base and result = getBaseType(type, i - 1))
}
predicate invalid_mro(Class t, Class left, Class right) {
DuckTyping::isNewStyle(t) and
predicate invalid_mro(ClassObject t, ClassObject left, ClassObject right) {
t.isNewStyle() and
left = left_base(t, right) and
left = getADirectSuperclass*(right)
left = right.getAnImproperSuperType()
}
from Class t, Class left, Class right
from ClassObject t, ClassObject left, ClassObject right
where invalid_mro(t, left, right)
select t,
"Construction of class " + t.getName() +

View File

@@ -12,76 +12,137 @@
*/
import python
private import LegacyPointsTo
import semmle.python.dataflow.new.DataFlow
private import semmle.python.dataflow.new.internal.DataFlowDispatch
private import semmle.python.ApiGraphs
/*
* This assumes that any indexing operation where the value is not a sequence or numpy array involves hashing.
* For sequences, the index must be an int, which are hashable, so we don't need to treat them specially.
* For numpy arrays, the index may be a list, which are not hashable and needs to be treated specially.
/**
* Holds if `cls` explicitly sets `__hash__` to `None`, making instances unhashable.
*/
predicate numpy_array_type(ClassValue na) {
exists(ModuleValue np | np.getName() = "numpy" or np.getName() = "numpy.core" |
na.getASuperType() = np.attr("ndarray")
)
predicate setsHashToNone(Class cls) {
DuckTyping::getAnAttributeValue(cls, "__hash__") instanceof None
}
predicate has_custom_getitem(Value v) {
v.getClass().lookup("__getitem__") instanceof PythonFunctionValue
or
numpy_array_type(v.getClass())
}
predicate explicitly_hashed(ControlFlowNode f) {
exists(CallNode c, GlobalVariable hash |
c.getArg(0) = f and c.getFunction().(NameNode).uses(hash) and hash.getId() = "hash"
)
}
predicate unhashable_subscript(ControlFlowNode f, ClassValue c, ControlFlowNode origin) {
is_unhashable(f, c, origin) and
exists(SubscriptNode sub | sub.getIndex() = f |
exists(Value custom_getitem |
sub.getObject().(ControlFlowNodeWithPointsTo).pointsTo(custom_getitem) and
not has_custom_getitem(custom_getitem)
)
)
}
predicate is_unhashable(ControlFlowNodeWithPointsTo f, ClassValue cls, ControlFlowNode origin) {
exists(Value v | f.pointsTo(v, origin) and v.getClass() = cls |
not cls.hasAttribute("__hash__") and not cls.failedInference(_) and cls.isNewStyle()
or
cls.lookup("__hash__") = Value::named("None")
/**
* Holds if `cls` has a `@dataclass` decorator with `frozen=True` or `unsafe_hash=True`,
* which generates a `__hash__` method at runtime.
*/
predicate hasDataclassHash(Class cls) {
exists(DataFlow::CallCfgNode decorator |
decorator = API::moduleImport("dataclasses").getMember("dataclass").getACall() and
decorator.asExpr() = cls.getADecorator() and
decorator.getArgByName(["frozen", "unsafe_hash"]).asExpr().(ImmutableLiteral).booleanValue() =
true
)
}
/**
* Holds if `f` is inside a `try` that catches `TypeError`. For example:
*
* try:
* ... f ...
* except TypeError:
* ...
*
* This predicate is used to eliminate false positive results. If `hash`
* is called on an unhashable object then a `TypeError` will be thrown.
* But this is not a bug if the code catches the `TypeError` and handles
* it.
* Holds if `cls` defines `__eq__` directly (not inherited from `object`),
* or has a `@dataclass` decorator that generates `__eq__` (the default).
*/
predicate typeerror_is_caught(ControlFlowNode f) {
exists(Try try |
try.getBody().contains(f.getNode()) and
try.getAHandler().getType().(ExprWithPointsTo).pointsTo(ClassValue::typeError())
predicate definesEq(Class cls) {
DuckTyping::hasMethod(cls, "__eq__")
or
// @dataclass(...) call generates __eq__ unless eq=False
exists(DataFlow::CallCfgNode decorator |
decorator = API::moduleImport("dataclasses").getMember("dataclass").getACall() and
decorator.asExpr() = cls.getADecorator() and
not decorator.getArgByName("eq").asExpr().(ImmutableLiteral).booleanValue() = false
)
or
// bare @dataclass without arguments also generates __eq__
cls.getADecorator() =
API::moduleImport("dataclasses").getMember("dataclass").getAValueReachableFromSource().asExpr()
}
/**
* Holds if `cls` is a user-defined class whose instances are unhashable.
*
* In Python, a class is unhashable if:
* - It explicitly sets `__hash__ = None`, or
* - It defines `__eq__` (directly or via decorator like `@dataclass`) without
* defining `__hash__`, and doesn't have a decorator that generates `__hash__`.
*/
predicate isUnhashableUserClass(Class cls) {
setsHashToNone(cls)
or
// In Python 3, defining __eq__ without __hash__ makes a class unhashable.
// This rule does not apply in Python 2.
major_version() = 3 and
DuckTyping::isNewStyle(cls) and
definesEq(cls) and
not DuckTyping::hasMethod(cls, "__hash__") and
not hasDataclassHash(cls) and
not DuckTyping::hasUnresolvedBase(getADirectSuperclass*(cls))
}
/**
* Gets the name of a builtin type whose instances are unhashable.
*/
string getUnhashableBuiltinName() { result = ["list", "set", "dict", "bytearray"] }
/**
* Holds if `origin` is a local source node tracking an unhashable instance that
* flows to `node`, with `clsName` describing the class for the alert.
*/
predicate isUnhashable(DataFlow::LocalSourceNode origin, DataFlow::Node node, string clsName) {
exists(Class c |
isUnhashableUserClass(c) and
origin = classInstanceTracker(c) and
origin.flowsTo(node) and
clsName = c.getName()
)
or
clsName = getUnhashableBuiltinName() and
origin = API::builtin(clsName).getAnInstance().asSource() and
origin.flowsTo(node)
}
predicate explicitly_hashed(DataFlow::Node node) {
node = API::builtin("hash").getACall().getArg(0)
}
/**
* Holds if the subscript object in `sub[...]` is known to use hashing for indexing,
* i.e. it does not have a custom `__getitem__` that could accept unhashable indices.
*/
predicate subscriptUsesHashing(Subscript sub) {
DataFlow::exprNode(sub.getObject()) =
API::builtin("dict").getAnInstance().getAValueReachableFromSource()
or
exists(Class cls |
classInstanceTracker(cls)
.(DataFlow::LocalSourceNode)
.flowsTo(DataFlow::exprNode(sub.getObject())) and
not DuckTyping::hasMethod(cls, "__getitem__")
)
}
from ControlFlowNode f, ClassValue c, ControlFlowNode origin
where
not typeerror_is_caught(f) and
(
explicitly_hashed(f) and is_unhashable(f, c, origin)
or
unhashable_subscript(f, c, origin)
predicate unhashable_subscript(DataFlow::LocalSourceNode origin, DataFlow::Node node, string clsName) {
exists(Subscript sub |
node = DataFlow::exprNode(sub.getIndex()) and
subscriptUsesHashing(sub)
|
isUnhashable(origin, node, clsName)
)
select f.getNode(), "This $@ of $@ is unhashable.", origin, "instance", c, c.getQualifiedName()
}
/**
* Holds if `e` is inside a `try` that catches `TypeError`.
*/
predicate typeerror_is_caught(Expr e) {
exists(Try try |
try.getBody().contains(e) and
try.getAHandler().getType() = API::builtin("TypeError").getAValueReachableFromSource().asExpr()
)
}
from DataFlow::LocalSourceNode origin, DataFlow::Node node, string clsName
where
not typeerror_is_caught(node.asExpr()) and
(
explicitly_hashed(node) and isUnhashable(origin, node, clsName)
or
unhashable_subscript(origin, node, clsName)
)
select node, "This $@ of $@ is unhashable.", origin, "instance", origin, clsName

View File

@@ -1 +1 @@
| inconsistent_mro.py:9:1:9:14 | Class Z | Construction of class Z can fail due to invalid method resolution order(MRO) for bases $@ and $@. | inconsistent_mro.py:3:1:3:16 | Class X | X | inconsistent_mro.py:6:1:6:11 | Class Y | Y |
| inconsistent_mro.py:9:1:9:14 | class Z | Construction of class Z can fail due to invalid method resolution order(MRO) for bases $@ and $@. | inconsistent_mro.py:3:1:3:16 | class X | X | inconsistent_mro.py:6:1:6:11 | class Y | Y |

View File

@@ -1 +1,2 @@
| inconsistent_mro.py:9:1:9:14 | Class Z | Construction of class Z can fail due to invalid method resolution order(MRO) for bases $@ and $@. | inconsistent_mro.py:3:1:3:16 | Class X | X | inconsistent_mro.py:6:1:6:11 | Class Y | Y |
| inconsistent_mro.py:9:1:9:14 | class Z | Construction of class Z can fail due to invalid method resolution order(MRO) for bases $@ and $@. | inconsistent_mro.py:3:1:3:16 | class X | X | inconsistent_mro.py:6:1:6:11 | class Y | Y |
| inconsistent_mro.py:16:1:16:19 | class N | Construction of class N can fail due to invalid method resolution order(MRO) for bases $@ and $@. | file://:Compiled Code:0:0:0:0 | builtin-class object | object | inconsistent_mro.py:12:1:12:8 | class O | O |

View File

@@ -0,0 +1,2 @@
| test.py:17:17:17:19 | ControlFlowNode for obj | This $@ of $@ is unhashable. | test.py:16:11:16:23 | ControlFlowNode for HasEqNoHash() | instance | test.py:16:11:16:23 | ControlFlowNode for HasEqNoHash() | HasEqNoHash |
| test.py:48:17:48:17 | ControlFlowNode for p | This $@ of $@ is unhashable. | test.py:47:9:47:27 | ControlFlowNode for MutableDataclass() | instance | test.py:47:9:47:27 | ControlFlowNode for MutableDataclass() | MutableDataclass |

View File

@@ -0,0 +1 @@
Expressions/HashedButNoHash.ql

View File

@@ -0,0 +1,48 @@
# Classes without __eq__ are hashable (inherit __hash__ from object)
class NoEqClass:
def __init__(self, x):
self.x = x
def hash_no_eq():
obj = NoEqClass(1)
return hash(obj) # OK: no __eq__, so __hash__ is inherited from object
# Classes with __eq__ but no __hash__ are unhashable
class HasEqNoHash:
def __eq__(self, other):
return self.x == other.x
def hash_eq_no_hash():
obj = HasEqNoHash()
return hash(obj) # should be flagged
# @dataclass(frozen=True) generates __hash__, so instances are hashable
from dataclasses import dataclass
@dataclass(frozen=True)
class FrozenPoint:
x: int
y: int
def hash_frozen_dataclass():
p = FrozenPoint(1, 2)
return hash(p) # OK: frozen dataclass has __hash__
# @dataclass(unsafe_hash=True) also generates __hash__
@dataclass(unsafe_hash=True)
class UnsafeHashPoint:
x: int
y: int
def hash_unsafe_hash_dataclass():
p = UnsafeHashPoint(1, 2)
return hash(p) # OK: unsafe_hash=True generates __hash__
# Plain @dataclass without frozen/unsafe_hash and with __eq__ is unhashable
@dataclass
class MutableDataclass:
x: int
def hash_mutable_dataclass():
p = MutableDataclass(1)
return hash(p) # should be flagged: @dataclass generates __eq__ but not __hash__

View File

@@ -1 +1 @@
| expressions_test.py:42:20:42:25 | unhash | This $@ of $@ is unhashable. | expressions_test.py:41:32:41:37 | ControlFlowNode for list() | instance | file://:0:0:0:0 | builtin-class list | list |
| expressions_test.py:42:20:42:25 | ControlFlowNode for unhash | This $@ of $@ is unhashable. | expressions_test.py:41:32:41:37 | ControlFlowNode for list() | instance | expressions_test.py:41:32:41:37 | ControlFlowNode for list() | list |

View File

@@ -279,3 +279,12 @@ def useofapply():
def apply(f):
pass
apply(foo)([1])
# NoEqClass is hashable (no __eq__ => inherits __hash__ from object).
class NoEqClass:
def __init__(self, x):
self.x = x
def hash_no_eq():
obj = NoEqClass(1)
return hash(obj) # OK: no __eq__, so __hash__ is inherited from object