Merge pull request #11833 from erik-krogh/trackPyReg

PY: track string-constants to regular expression uses
This commit is contained in:
Erik Krogh Kristensen
2023-02-01 11:40:42 +01:00
committed by GitHub
6 changed files with 4926 additions and 5 deletions

View File

@@ -38,15 +38,30 @@ private API::Node relevant_re_member(string name) {
name != "escape"
}
private import semmle.python.dataflow.new.internal.DataFlowImplForRegExp as RegData
/**
 * A data-flow configuration that tracks string-constants to the places where they
 * are used as regular expressions.
 *
 * Sources are nodes whose expression is a `Bytes` or a `Unicode`; sinks are nodes
 * whose expression satisfies `used_as_regex_internal` for any regex-mode.
 */
private class RegexTracking extends RegData::Configuration {
  RegexTracking() { this = "RegexTracking" }

  override predicate isSource(RegData::Node node) {
    exists(Expr constant | constant = node.asExpr() |
      constant instanceof Bytes or
      constant instanceof Unicode
    )
  }

  override predicate isSink(RegData::Node node) {
    // The regex-mode is irrelevant for deciding whether a node is a sink.
    exists(Expr regex | regex = node.asExpr() | used_as_regex_internal(regex, _))
  }
}
/**
* Holds if `s` is used as a regex with the `re` module, with the regex-mode `mode` (if known).
* Holds if the expression `e` is used as a regex with the `re` module, with the regex-mode `mode` (if known).
* If regex mode is not known, `mode` will be `"None"`.
*
* This predicate has not done any data-flow tracking.
*/
predicate used_as_regex(Expr s, string mode) {
(s instanceof Bytes or s instanceof Unicode) and
private predicate used_as_regex_internal(Expr e, string mode) {
/* Call to re.xxx(regex, ... [mode]) */
exists(DataFlow::CallCfgNode call, string name |
call.getArg(0).asExpr() = s and
call.getArg(0).asExpr() = e and
call = relevant_re_member(name).getACall()
|
mode = "None"
@@ -55,6 +70,21 @@ predicate used_as_regex(Expr s, string mode) {
)
}
/**
 * Holds if the string-constant `s` flows to a use as a regex with the `re` module,
 * where that use has the regex-mode `mode` (if known).
 * If the regex mode is not known, `mode` will be `"None"`.
 *
 * Unlike `used_as_regex_internal`, this predicate uses the `RegexTracking` data-flow
 * configuration to relate the string-constant to the expression used as a regex.
 */
predicate used_as_regex(Expr s, string mode) {
  (s instanceof Bytes or s instanceof Unicode) and
  exists(RegData::Node source, RegData::Node sink |
    source.asExpr() = s and
    any(RegexTracking t).hasFlow(source, sink) and
    used_as_regex_internal(sink.asExpr(), mode)
  )
}
/**
* Gets the canonical name for the API graph node corresponding to the `re` flag `flag`. For flags
* that have multiple names, we pick the long-form name as a canonical representative.