Python: Fix bad join order for sensitive data

Not the prettiest of solutions, but it does the job. Basically, we were
calculating (and re-calculating) the same big relation between strings
and regexes and then checking whether the latter matched the former.

This resulted in tuple counts like the following:

```
[2021-07-12 16:09:24] (12s) Tuple counts for SensitiveDataSources::SensitiveDataModeling::SensitiveVariableAssignment#class#ff#shared/4@7489c6:
4918074 ~0%     {4} r1 = JOIN SensitiveDataHeuristics::HeuristicNames::maybeSensitiveRegexp#ff WITH Flow::NameNode::getId_dispred#ff CARTESIAN PRODUCT OUTPUT Lhs.0 'arg0', Lhs.1 'arg1', Rhs.0, Rhs.1 'arg3'
2654    ~0%     {4} r2 = JOIN r1 WITH PRIMITIVE regexpMatch#bb ON Lhs.3 'arg3',Lhs.1 'arg1'
                return r2
```
(The above being just the bit that handles `DefinitionNode` in
`SensitiveVariableAssignment`, and taking 12 seconds to evaluate.)
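
For reference, `nameIndicatesSensitiveData` (the predicate each of those call
sites goes through) takes an already-bound string and matches it against the
heuristic regexes -- roughly the following shape (a sketch, not the exact
library source):

```
// Sketch only: the string has to be bound by the caller, so every call site
// drags its own (large, duplicated) set of strings into the regex match.
bindingset[name]
predicate nameIndicatesSensitiveData(string name, SensitiveDataClassification classification) {
  name.regexpMatch(maybeSensitiveRegexp(classification)) and
  not name.regexpMatch(notSensitiveRegexp())
}
```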

By applying a bit of manual inlining and magic, this becomes somewhat
more manageable:

```
[2021-07-12 15:59:44] (1s) Tuple counts for SensitiveDataSources::SensitiveDataModeling::sensitiveString#ff/2@8830e2:
27671  ~2%      {3} r1 = JOIN SensitiveDataHeuristics::HeuristicNames::maybeSensitiveRegexp#ff WITH SensitiveDataSources::SensitiveDataModeling::sensitiveParameterName#f CARTESIAN PRODUCT OUTPUT Lhs.0 'classification', Lhs.1, Rhs.0

334012 ~2%      {3} r2 = JOIN SensitiveDataHeuristics::HeuristicNames::maybeSensitiveRegexp#ff WITH SensitiveDataSources::SensitiveDataModeling::sensitiveName#f CARTESIAN PRODUCT OUTPUT Lhs.0 'classification', Lhs.1, Rhs.0

361683 ~11%     {3} r3 = r1 UNION r2

154644 ~0%      {3} r4 = JOIN SensitiveDataHeuristics::HeuristicNames::maybeSensitiveRegexp#ff WITH SensitiveDataSources::SensitiveDataModeling::sensitiveFunctionName#f CARTESIAN PRODUCT OUTPUT Lhs.0 'classification', Lhs.1, Rhs.0

149198 ~1%      {3} r5 = JOIN SensitiveDataHeuristics::HeuristicNames::maybeSensitiveRegexp#ff WITH SensitiveDataSources::SensitiveDataModeling::sensitiveStrConst#f CARTESIAN PRODUCT OUTPUT Lhs.0 'classification', Lhs.1, Rhs.0

124257 ~5%      {3} r6 = JOIN SensitiveDataHeuristics::HeuristicNames::maybeSensitiveRegexp#ff WITH SensitiveDataSources::SensitiveDataModeling::sensitiveAttributeName#f CARTESIAN PRODUCT OUTPUT Lhs.0 'classification', Lhs.1, Rhs.0

273455 ~21%     {3} r7 = r5 UNION r6
428099 ~30%     {3} r8 = r4 UNION r7
789782 ~78%     {3} r9 = r3 UNION r8
1121   ~77%     {3} r10 = JOIN r9 WITH PRIMITIVE regexpMatch#bb ON Lhs.2 'result',Lhs.1
1121   ~70%     {2} r11 = SCAN r10 OUTPUT In.0 'classification', In.2 'result'
                return r11
```
(The above being the total for all the sensitive names we care about,
taking only 1.2 seconds to evaluate.)

Incidentally, you may wonder why this has _fewer_ results than before.
The answer is control flow splitting -- every sensitively-named
`DefinitionNode` would have been matched in isolation previously. By
pre-matching on just the names of these, we can subsequently join
against those names that are known to be sensitive, which is a much
faster operation.
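
In code, the pre-match-then-join shape boils down to something like this
(condensed from the new predicates in the diff below; the real
`sensitiveString` unions several candidate predicates, not just variable
names):

```
// Candidate names are materialized once, behind pragma[nomagic], and only
// then matched against the sensitive-data regexes; call sites simply join
// against the resulting set of known-sensitive strings.
pragma[nomagic]
private string sensitiveNameCandidate() {
  result = any(Name n).getId() and
  not result.regexpMatch(notSensitiveRegexp())
}

pragma[nomagic]
private string sensitiveString(SensitiveDataClassification classification) {
  result = sensitiveNameCandidate() and
  result.regexpMatch(maybeSensitiveRegexp(classification))
}
```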

(We also get the benefit of deduplicating the strings that are matched,
before actually performing the match, so if, say, an attribute name and
a variable name are identical, then we'll only match them once.)

We also exclude docstrings from the set of relevant string constants, as
these presumably don't actually flow anywhere.

```
@@ -60,7 +60,7 @@ private module SensitiveDataModeling {
   ) {
     t.start() and
     exists(Function f |
-      nameIndicatesSensitiveData(f.getName(), classification) and
+      f.getName() = sensitiveString(classification) and
       result.asExpr() = f.getDefinition()
     )
     or
@@ -83,7 +83,7 @@ private module SensitiveDataModeling {
     // Note: If this is implemented with type-tracking, we will get cross-talk as
     // illustrated in python/ql/test/experimental/dataflow/sensitive-data/test.py
     exists(DataFlow::LocalSourceNode source |
-      nameIndicatesSensitiveData(source.asExpr().(StrConst).getText(), classification) and
+      source.asExpr().(StrConst).getText() = sensitiveString(classification) and
       source.flowsTo(result)
     )
   }
@@ -97,7 +97,7 @@ private module SensitiveDataModeling {
       or
       // to cover functions that we don't have the definition for, and where the
       // reference to the function has not already been marked as being sensitive
-      nameIndicatesSensitiveData(this.getFunction().asCfgNode().(NameNode).getId(), classification)
+      this.getFunction().asCfgNode().(NameNode).getId() = sensitiveString(classification)
     }

     override SensitiveDataClassification getClassification() { result = classification }
@@ -164,6 +164,46 @@ private module SensitiveDataModeling {
     nodeFrom = possibleSensitiveCallable()
   }

+  pragma[nomagic]
+  private string sensitiveStrConstCandidate() {
+    result = any(StrConst s | not s.isDocString()).getText() and
+    not result.regexpMatch(notSensitiveRegexp())
+  }
+
+  pragma[nomagic]
+  private string sensitiveAttributeNameCandidate() {
+    result = any(DataFlow::AttrRead a).getAttributeName() and
+    not result.regexpMatch(notSensitiveRegexp())
+  }
+
+  pragma[nomagic]
+  private string sensitiveParameterNameCandidate() {
+    result = any(Parameter p).getName() and
+    not result.regexpMatch(notSensitiveRegexp())
+  }
+
+  pragma[nomagic]
+  private string sensitiveFunctionNameCandidate() {
+    result = any(Function f).getName() and
+    not result.regexpMatch(notSensitiveRegexp())
+  }
+
+  pragma[nomagic]
+  private string sensitiveNameCandidate() {
+    result = any(Name n).getId() and
+    not result.regexpMatch(notSensitiveRegexp())
+  }
+
+  pragma[nomagic]
+  private string sensitiveString(SensitiveDataClassification classification) {
+    result in [
+        sensitiveNameCandidate(), sensitiveAttributeNameCandidate(),
+        sensitiveParameterNameCandidate(), sensitiveFunctionNameCandidate(),
+        sensitiveStrConstCandidate()
+      ] and
+    result.regexpMatch(maybeSensitiveRegexp(classification))
+  }
+
   /**
    * Any kind of variable assignment (also including with/for) where the name indicates
    * it contains sensitive data.
@@ -182,7 +222,7 @@ private module SensitiveDataModeling {
     SensitiveVariableAssignment() {
       exists(DefinitionNode def |
-        nameIndicatesSensitiveData(def.(NameNode).getId(), classification) and
+        def.(NameNode).getId() = sensitiveString(classification) and
         (
           this.asCfgNode() = def.getValue()
           or
@@ -193,7 +233,7 @@ private module SensitiveDataModeling {
       )
       or
       exists(With with |
-        nameIndicatesSensitiveData(with.getOptionalVars().(Name).getId(), classification) and
+        with.getOptionalVars().(Name).getId() = sensitiveString(classification) and
         this.asExpr() = with.getContextExpr()
       )
     }
@@ -209,7 +249,7 @@ private module SensitiveDataModeling {
      // Things like `foo.<sensitive-name>` or `from <module> import <sensitive-name>`
      // I considered excluding any `from ... import something_sensitive`, but then realized that
      // we should flag up `form ... import password as ...` as a password
-     nameIndicatesSensitiveData(this.(DataFlow::AttrRead).getAttributeName(), classification)
+     this.(DataFlow::AttrRead).getAttributeName() = sensitiveString(classification)
      or
      // Things like `getattr(foo, <reference-to-string>)`
      this.(DataFlow::AttrRead).getAttributeNameExpr() = sensitiveLookupStringConst(classification)
@@ -246,9 +286,7 @@ private module SensitiveDataModeling {
   class SensitiveParameter extends SensitiveDataSource::Range, DataFlow::ParameterNode {
     SensitiveDataClassification classification;

-    SensitiveParameter() {
-      nameIndicatesSensitiveData(this.getParameter().getName(), classification)
-    }
+    SensitiveParameter() { this.getParameter().getName() = sensitiveString(classification) }

     override SensitiveDataClassification getClassification() { result = classification }
   }
```