mirror of
https://github.com/github/codeql.git
synced 2025-12-18 01:33:15 +01:00
Python: Fix bad join order for sensitive data
Not the prettiest of solutions, but it does the job. Basically, we were
calculating (and re-calculating) the same big relation between strings
and regexes and then checking whether the latter matched the former.
This resulted in tuple counts like the following:
```
[2021-07-12 16:09:24] (12s) Tuple counts for SensitiveDataSources::SensitiveDataModeling::SensitiveVariableAssignment#class#ff#shared/4@7489c6:
4918074 ~0% {4} r1 = JOIN SensitiveDataHeuristics::HeuristicNames::maybeSensitiveRegexp#ff WITH Flow::NameNode::getId_dispred#ff CARTESIAN PRODUCT OUTPUT Lhs.0 'arg0', Lhs.1 'arg1', Rhs.0, Rhs.1 'arg3'
2654 ~0% {4} r2 = JOIN r1 WITH PRIMITIVE regexpMatch#bb ON Lhs.3 'arg3',Lhs.1 'arg1'
return r2
```
(The above being just the bit that handles `DefinitionNode` in
`SensitiveVariableAssignment`, and taking 12 seconds to evaluate.)
By applying a bit of manual inlining and magic, this becomes somewhat
more manageable:
```
[2021-07-12 15:59:44] (1s) Tuple counts for SensitiveDataSources::SensitiveDataModeling::sensitiveString#ff/2@8830e2:
27671 ~2% {3} r1 = JOIN SensitiveDataHeuristics::HeuristicNames::maybeSensitiveRegexp#ff WITH SensitiveDataSources::SensitiveDataModeling::sensitiveParameterName#f CARTESIAN PRODUCT OUTPUT Lhs.0 'classification', Lhs.1, Rhs.0
334012 ~2% {3} r2 = JOIN SensitiveDataHeuristics::HeuristicNames::maybeSensitiveRegexp#ff WITH SensitiveDataSources::SensitiveDataModeling::sensitiveName#f CARTESIAN PRODUCT OUTPUT Lhs.0 'classification', Lhs.1, Rhs.0
361683 ~11% {3} r3 = r1 UNION r2
154644 ~0% {3} r4 = JOIN SensitiveDataHeuristics::HeuristicNames::maybeSensitiveRegexp#ff WITH SensitiveDataSources::SensitiveDataModeling::sensitiveFunctionName#f CARTESIAN PRODUCT OUTPUT Lhs.0 'classification', Lhs.1, Rhs.0
149198 ~1% {3} r5 = JOIN SensitiveDataHeuristics::HeuristicNames::maybeSensitiveRegexp#ff WITH SensitiveDataSources::SensitiveDataModeling::sensitiveStrConst#f CARTESIAN PRODUCT OUTPUT Lhs.0 'classification', Lhs.1, Rhs.0
124257 ~5% {3} r6 = JOIN SensitiveDataHeuristics::HeuristicNames::maybeSensitiveRegexp#ff WITH SensitiveDataSources::SensitiveDataModeling::sensitiveAttributeName#f CARTESIAN PRODUCT OUTPUT Lhs.0 'classification', Lhs.1, Rhs.0
273455 ~21% {3} r7 = r5 UNION r6
428099 ~30% {3} r8 = r4 UNION r7
789782 ~78% {3} r9 = r3 UNION r8
1121 ~77% {3} r10 = JOIN r9 WITH PRIMITIVE regexpMatch#bb ON Lhs.2 'result',Lhs.1
1121 ~70% {2} r11 = SCAN r10 OUTPUT In.0 'classification', In.2 'result'
return r11
```
(The above being the total for all the sensitive names we care about,
taking only 1.2 seconds to evaluate.)
Incidentally, you may wonder why this has _fewer_ results than before.
The answer is control flow splitting -- every sensitively-named
`DefinitionNode` would have been matched in isolation previously. By
pre-matching on just the names of these, we can subsequently join
against those names that are known to be sensitive, which is a much
faster operation.
(We also get the benefit of deduplicating the strings that are matched,
before actually performing the match, so if, say, an attribute name and
a variable name are identical, then we'll only match them once.)
We also exclude all docstrings as relevant string constants, as these
presumably don't actually flow anywhere.
This commit is contained in:
@@ -60,7 +60,7 @@ private module SensitiveDataModeling {
|
||||
) {
|
||||
t.start() and
|
||||
exists(Function f |
|
||||
nameIndicatesSensitiveData(f.getName(), classification) and
|
||||
f.getName() = sensitiveString(classification) and
|
||||
result.asExpr() = f.getDefinition()
|
||||
)
|
||||
or
|
||||
@@ -83,7 +83,7 @@ private module SensitiveDataModeling {
|
||||
// Note: If this is implemented with type-tracking, we will get cross-talk as
|
||||
// illustrated in python/ql/test/experimental/dataflow/sensitive-data/test.py
|
||||
exists(DataFlow::LocalSourceNode source |
|
||||
nameIndicatesSensitiveData(source.asExpr().(StrConst).getText(), classification) and
|
||||
source.asExpr().(StrConst).getText() = sensitiveString(classification) and
|
||||
source.flowsTo(result)
|
||||
)
|
||||
}
|
||||
@@ -97,7 +97,7 @@ private module SensitiveDataModeling {
|
||||
or
|
||||
// to cover functions that we don't have the definition for, and where the
|
||||
// reference to the function has not already been marked as being sensitive
|
||||
nameIndicatesSensitiveData(this.getFunction().asCfgNode().(NameNode).getId(), classification)
|
||||
this.getFunction().asCfgNode().(NameNode).getId() = sensitiveString(classification)
|
||||
}
|
||||
|
||||
override SensitiveDataClassification getClassification() { result = classification }
|
||||
@@ -164,6 +164,46 @@ private module SensitiveDataModeling {
|
||||
nodeFrom = possibleSensitiveCallable()
|
||||
}
|
||||
|
||||
pragma[nomagic]
|
||||
private string sensitiveStrConstCandidate() {
|
||||
result = any(StrConst s | not s.isDocString()).getText() and
|
||||
not result.regexpMatch(notSensitiveRegexp())
|
||||
}
|
||||
|
||||
pragma[nomagic]
|
||||
private string sensitiveAttributeNameCandidate() {
|
||||
result = any(DataFlow::AttrRead a).getAttributeName() and
|
||||
not result.regexpMatch(notSensitiveRegexp())
|
||||
}
|
||||
|
||||
pragma[nomagic]
|
||||
private string sensitiveParameterNameCandidate() {
|
||||
result = any(Parameter p).getName() and
|
||||
not result.regexpMatch(notSensitiveRegexp())
|
||||
}
|
||||
|
||||
pragma[nomagic]
|
||||
private string sensitiveFunctionNameCandidate() {
|
||||
result = any(Function f).getName() and
|
||||
not result.regexpMatch(notSensitiveRegexp())
|
||||
}
|
||||
|
||||
pragma[nomagic]
|
||||
private string sensitiveNameCandidate() {
|
||||
result = any(Name n).getId() and
|
||||
not result.regexpMatch(notSensitiveRegexp())
|
||||
}
|
||||
|
||||
pragma[nomagic]
|
||||
private string sensitiveString(SensitiveDataClassification classification) {
|
||||
result in [
|
||||
sensitiveNameCandidate(), sensitiveAttributeNameCandidate(),
|
||||
sensitiveParameterNameCandidate(), sensitiveFunctionNameCandidate(),
|
||||
sensitiveStrConstCandidate()
|
||||
] and
|
||||
result.regexpMatch(maybeSensitiveRegexp(classification))
|
||||
}
|
||||
|
||||
/**
|
||||
* Any kind of variable assignment (also including with/for) where the name indicates
|
||||
* it contains sensitive data.
|
||||
@@ -182,7 +222,7 @@ private module SensitiveDataModeling {
|
||||
|
||||
SensitiveVariableAssignment() {
|
||||
exists(DefinitionNode def |
|
||||
nameIndicatesSensitiveData(def.(NameNode).getId(), classification) and
|
||||
def.(NameNode).getId() = sensitiveString(classification) and
|
||||
(
|
||||
this.asCfgNode() = def.getValue()
|
||||
or
|
||||
@@ -193,7 +233,7 @@ private module SensitiveDataModeling {
|
||||
)
|
||||
or
|
||||
exists(With with |
|
||||
nameIndicatesSensitiveData(with.getOptionalVars().(Name).getId(), classification) and
|
||||
with.getOptionalVars().(Name).getId() = sensitiveString(classification) and
|
||||
this.asExpr() = with.getContextExpr()
|
||||
)
|
||||
}
|
||||
@@ -209,7 +249,7 @@ private module SensitiveDataModeling {
|
||||
// Things like `foo.<sensitive-name>` or `from <module> import <sensitive-name>`
|
||||
// I considered excluding any `from ... import something_sensitive`, but then realized that
|
||||
// we should flag up `form ... import password as ...` as a password
|
||||
nameIndicatesSensitiveData(this.(DataFlow::AttrRead).getAttributeName(), classification)
|
||||
this.(DataFlow::AttrRead).getAttributeName() = sensitiveString(classification)
|
||||
or
|
||||
// Things like `getattr(foo, <reference-to-string>)`
|
||||
this.(DataFlow::AttrRead).getAttributeNameExpr() = sensitiveLookupStringConst(classification)
|
||||
@@ -246,9 +286,7 @@ private module SensitiveDataModeling {
|
||||
class SensitiveParameter extends SensitiveDataSource::Range, DataFlow::ParameterNode {
|
||||
SensitiveDataClassification classification;
|
||||
|
||||
SensitiveParameter() {
|
||||
nameIndicatesSensitiveData(this.getParameter().getName(), classification)
|
||||
}
|
||||
SensitiveParameter() { this.getParameter().getName() = sensitiveString(classification) }
|
||||
|
||||
override SensitiveDataClassification getClassification() { result = classification }
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user