From 8f152a5bfb2cc624db4644f608144eb09ff87ce8 Mon Sep 17 00:00:00 2001 From: Rasmus Lerchedahl Petersen Date: Mon, 13 Sep 2021 16:50:00 +0200 Subject: [PATCH] Python: Port regex concepts and adapt PolyRedos --- python/ql/lib/semmle/python/Concepts.qll | 46 ++++++ .../lib/semmle/python/frameworks/Stdlib.qll | 152 ++++++++++++++++++ .../PolynomialReDoSCustomizations.qll | 136 +--------------- 3 files changed, 199 insertions(+), 135 deletions(-) diff --git a/python/ql/lib/semmle/python/Concepts.qll b/python/ql/lib/semmle/python/Concepts.qll index 5517347e692..95bf56d11df 100644 --- a/python/ql/lib/semmle/python/Concepts.qll +++ b/python/ql/lib/semmle/python/Concepts.qll @@ -355,6 +355,41 @@ module SqlExecution { } } +/** + * A data-flow node that executes a regular expression. + * + * Extend this class to refine existing API models. If you want to model new APIs, + * extend `RegexExecution::Range` instead. + */ +class RegexExecution extends DataFlow::Node { + RegexExecution::Range range; + + RegexExecution() { this = range } + + /** Gets the data flow node for the regex being executed by this node. */ + DataFlow::Node getRegexNode() { result = range.getRegexNode() } + + /** Gets a dataflow node for the string to be searched or matched against. */ + DataFlow::Node getString() { result = range.getString() } +} + +/** Provides classes for modeling new regular-expression execution APIs. */ +module RegexExecution { + /** + * A data-flow node that executes a regular expression. + * + * Extend this class to model new APIs. If you want to refine existing API models, + * extend `RegexExecution` instead. + */ + abstract class Range extends DataFlow::Node { + /** Gets the data flow node for the regex being executed by this node. */ + abstract DataFlow::Node getRegexNode(); + + /** Gets a dataflow node for the string to be searched or matched against. 
*/ + abstract DataFlow::Node getString(); + } +} + /** * A data-flow node that escapes meta-characters, which could be used to prevent * injection attacks. @@ -411,6 +446,9 @@ module Escaping { /** Gets the escape-kind for escaping a string so it can safely be included in HTML. */ string getHtmlKind() { result = "html" } + + /** Gets the escape-kind for escaping a string so it can safely be included in a regular expression. */ + string getRegexKind() { result = "regex" } // TODO: If adding an XML kind, update the modeling of the `MarkupSafe` PyPI package. // // Technically it claims to escape for both HTML and XML, but for now we don't have @@ -427,6 +465,14 @@ class HtmlEscaping extends Escaping { HtmlEscaping() { range.getKind() = Escaping::getHtmlKind() } } +/** + * An escape of a string so it can be safely included in + * the body of a regex. + */ +class RegexEscaping extends Escaping { + RegexEscaping() { range.getKind() = Escaping::getRegexKind() } +} + /** Provides classes for modeling HTTP-related APIs. */ module HTTP { import semmle.python.web.HttpConstants diff --git a/python/ql/lib/semmle/python/frameworks/Stdlib.qll b/python/ql/lib/semmle/python/frameworks/Stdlib.qll index 539f0dcabb0..d85e2408453 100644 --- a/python/ql/lib/semmle/python/frameworks/Stdlib.qll +++ b/python/ql/lib/semmle/python/frameworks/Stdlib.qll @@ -1497,6 +1497,158 @@ private module StdlibPrivate { } } +// --------------------------------------------------------------------------- +// re +// --------------------------------------------------------------------------- +/** + * List of methods in the `re` module immediately executing a regular expression. + * + * See https://docs.python.org/3/library/re.html#module-contents + */ +private class RegexExecutionMethod extends string { + RegexExecutionMethod() { + this in ["match", "fullmatch", "search", "split", "findall", "finditer", "sub", "subn"] + } +} + +/** Gets the index of the argument representing the string to be searched by a regex. 
*/ +int stringArg(RegexExecutionMethod method) { + method in ["match", "fullmatch", "search", "split", "findall", "finditer"] and + result = 1 + or + method in ["sub", "subn"] and + result = 2 +} + +/** + * A call to a method from the `re` module immediately executing a regular expression. + * + * See `RegexExecutionMethod` + */ +private class DirectRegex extends DataFlow::CallCfgNode, RegexExecution::Range { + RegexExecutionMethod method; + + DirectRegex() { this = API::moduleImport("re").getMember(method).getACall() } + + override DataFlow::Node getRegexNode() { + result in [this.getArg(0), this.getArgByName("pattern")] + } + + override DataFlow::Node getString() { + result in [this.getArg(stringArg(method)), this.getArgByName("string")] + } +} + +/** Helper module for tracking compiled regexes. */ +private module CompiledRegexes { + private import semmle.python.dataflow.new.DataFlow2 + private import semmle.python.RegexTreeView + + // TODO: This module should be refactored once API graphs are more expressive. + /** A configuration for finding uses of compiled regexes. */ + class RegexDefinitionConfiguration extends DataFlow2::Configuration { + RegexDefinitionConfiguration() { this = "RegexDefinitionConfiguration" } + + override predicate isSource(DataFlow::Node source) { source instanceof RegexDefinitonSource } + + override predicate isSink(DataFlow::Node sink) { sink instanceof RegexDefinitionSink } + } + + /** A regex compilation. */ + class RegexDefinitonSource extends DataFlow::CallCfgNode { + DataFlow::Node regexNode; + + RegexDefinitonSource() { + this = API::moduleImport("re").getMember("compile").getACall() and + regexNode in [this.getArg(0), this.getArgByName("pattern")] + } + + /** Gets the data flow node for the regex being compiled by this node. */ + DataFlow::Node getRegexNode() { result = regexNode } + } + + /** A use of a compiled regex. 
*/ + class RegexDefinitionSink extends DataFlow::Node { + RegexExecutionMethod method; + DataFlow::CallCfgNode executingCall; + + RegexDefinitionSink() { + executingCall = + API::moduleImport("re").getMember("compile").getReturn().getMember(method).getACall() and + this = executingCall.getFunction().(DataFlow::AttrRead).getObject() + } + + /** Gets the method used to execute the regex. */ + RegexExecutionMethod getMethod() { result = method } + + /** Gets the data flow node for the executing call. */ + DataFlow::CallCfgNode getExecutingCall() { result = executingCall } + } +} + +private import CompiledRegexes + +/** + * A call on a compiled regular expression (obtained via `re.compile`) executing a + * regular expression. + * + * Given the following example: + * + * ```py + * pattern = re.compile(input) + * pattern.match(s) + * ``` + * + * This class will identify that `re.compile` compiles `input` and afterwards + * executes `re`'s `match`. As a result, `this` will refer to `pattern.match(s)` + * and `this.getRegexNode()` will return the node for `input` (`re.compile`'s first argument). + * + * Registered through `RegexExecution::Range` so the `RegexExecution` concept picks these calls up. + * + * See `RegexExecutionMethod` and + * https://docs.python.org/3/library/re.html#regular-expression-objects + */ +private class CompiledRegex extends DataFlow::CallCfgNode, RegexExecution::Range { + DataFlow::Node regexNode; + RegexExecutionMethod method; + + CompiledRegex() { + exists( + RegexDefinitionConfiguration conf, RegexDefinitonSource source, RegexDefinitionSink sink + | + conf.hasFlow(source, sink) and + regexNode = source.getRegexNode() and + method = sink.getMethod() and + this = sink.getExecutingCall() + ) + } + + override DataFlow::Node getRegexNode() { result = regexNode } + + override DataFlow::Node getString() { + result in [this.getArg(stringArg(method) - 1), this.getArgByName("string")] + } +} + +/** + * A call to 're.escape'. 
+ * See https://docs.python.org/3/library/re.html#re.escape + */ +private class ReEscapeCall extends Escaping::Range, DataFlow::CallCfgNode { + DataFlow::Node regexNode; + + ReEscapeCall() { + this = API::moduleImport("re").getMember("escape").getACall() and + regexNode in [this.getArg(0), this.getArgByName("pattern")] + } + + override DataFlow::Node getAnInput() { result = regexNode } + + override DataFlow::Node getOutput() { result = this } + + override string getKind() { result = Escaping::getRegexKind() } +} + // --------------------------------------------------------------------------- // OTHER // --------------------------------------------------------------------------- diff --git a/python/ql/lib/semmle/python/security/dataflow/PolynomialReDoSCustomizations.qll b/python/ql/lib/semmle/python/security/dataflow/PolynomialReDoSCustomizations.qll index cbaf3b982e9..33d88c5d0ed 100644 --- a/python/ql/lib/semmle/python/security/dataflow/PolynomialReDoSCustomizations.qll +++ b/python/ql/lib/semmle/python/security/dataflow/PolynomialReDoSCustomizations.qll @@ -60,7 +60,7 @@ module PolynomialReDoS { RegExpTerm t; RegexExecutionAsSink() { - exists(CompiledRegexes::RegexExecution re | + exists(RegexExecution re | re.getRegexNode().asExpr() = t.getRegex() and this = re.getString() ) and @@ -76,137 +76,3 @@ module PolynomialReDoS { */ class StringConstCompareAsSanitizerGuard extends SanitizerGuard, StringConstCompare { } } - -/** Helper module for tracking compiled regexes. */ -private module CompiledRegexes { - // TODO: This module should be refactored and merged with the experimental work done on detecting - // regex injections, such that this can be expressed from just using a concept. - /** A configuration for finding uses of compiled regexes. 
*/ - class RegexDefinitionConfiguration extends DataFlow2::Configuration { - RegexDefinitionConfiguration() { this = "RegexDefinitionConfiguration" } - - override predicate isSource(DataFlow::Node source) { source instanceof RegexDefinitonSource } - - override predicate isSink(DataFlow::Node sink) { sink instanceof RegexDefinitionSink } - } - - /** A regex compilation. */ - class RegexDefinitonSource extends DataFlow::CallCfgNode { - DataFlow::Node regexNode; - - RegexDefinitonSource() { - this = API::moduleImport("re").getMember("compile").getACall() and - regexNode in [this.getArg(0), this.getArgByName("pattern")] - } - - /** Gets the regex that is being compiled by this node. */ - RegExpTerm getRegExp() { result.getRegex() = regexNode.asExpr() and result.isRootTerm() } - - /** Gets the data flow node for the regex being compiled by this node. */ - DataFlow::Node getRegexNode() { result = regexNode } - } - - /** A use of a compiled regex. */ - class RegexDefinitionSink extends DataFlow::Node { - RegexExecutionMethod method; - DataFlow::CallCfgNode executingCall; - - RegexDefinitionSink() { - exists(DataFlow::AttrRead reMethod | - executingCall.getFunction() = reMethod and - reMethod.getAttributeName() = method and - this = reMethod.getObject() - ) - } - - /** Gets the method used to execute the regex. */ - RegexExecutionMethod getMethod() { result = method } - - /** Gets the data flow node for the executing call. */ - DataFlow::CallCfgNode getExecutingCall() { result = executingCall } - } - - /** A data flow node executing a regex. */ - abstract class RegexExecution extends DataFlow::Node { - /** Gets the data flow node for the regex being compiled by this node. */ - abstract DataFlow::Node getRegexNode(); - - /** Gets a dataflow node for the string to be searched or matched against. 
*/ - abstract DataFlow::Node getString(); - } - - private class RegexExecutionMethod extends string { - RegexExecutionMethod() { - this in ["match", "fullmatch", "search", "split", "findall", "finditer", "sub", "subn"] - } - } - - /** Gets the index of the argument representing the string to be searched by a regex. */ - int stringArg(RegexExecutionMethod method) { - method in ["match", "fullmatch", "search", "split", "findall", "finditer"] and - result = 1 - or - method in ["sub", "subn"] and - result = 2 - } - - /** - * A class to find `re` methods immediately executing an expression. - * - * See `RegexExecutionMethods` - */ - class DirectRegex extends DataFlow::CallCfgNode, RegexExecution { - RegexExecutionMethod method; - - DirectRegex() { this = API::moduleImport("re").getMember(method).getACall() } - - override DataFlow::Node getRegexNode() { - result in [this.getArg(0), this.getArgByName("pattern")] - } - - override DataFlow::Node getString() { - result in [this.getArg(stringArg(method)), this.getArgByName("string")] - } - } - - /** - * A class to find `re` methods immediately executing a compiled expression by `re.compile`. - * - * Given the following example: - * - * ```py - * pattern = re.compile(input) - * pattern.match(s) - * ``` - * - * This class will identify that `re.compile` compiles `input` and afterwards - * executes `re`'s `match`. 
As a result, `this` will refer to `pattern.match(s)` - * and `this.getRegexNode()` will return the node for `input` (`re.compile`'s first argument) - * - * - * See `RegexExecutionMethods` - * - * See https://docs.python.org/3/library/re.html#regular-expression-objects - */ - private class CompiledRegex extends DataFlow::CallCfgNode, RegexExecution { - DataFlow::Node regexNode; - RegexExecutionMethod method; - - CompiledRegex() { - exists( - RegexDefinitionConfiguration conf, RegexDefinitonSource source, RegexDefinitionSink sink - | - conf.hasFlow(source, sink) and - regexNode = source.getRegexNode() and - method = sink.getMethod() and - this = sink.getExecutingCall() - ) - } - - override DataFlow::Node getRegexNode() { result = regexNode } - - override DataFlow::Node getString() { - result in [this.getArg(stringArg(method) - 1), this.getArgByName("string")] - } - } -}