/**
 * Provides an extension point for modeling sensitive data, such as secrets, certificates, or passwords.
 * Sensitive data can be interesting to use as data-flow sources in security queries.
 */

private import python
private import semmle.python.dataflow.new.DataFlow
// Need to import `semmle.python.Frameworks` since frameworks can extend `SensitiveDataSource::Range`
private import semmle.python.Frameworks
private import codeql.concepts.internal.SensitiveDataHeuristics as SensitiveDataHeuristics
private import semmle.python.ApiGraphs

// We export these explicitly, so we don't also export the `HeuristicNames` module.
class SensitiveDataClassification = SensitiveDataHeuristics::SensitiveDataClassification;

module SensitiveDataClassification = SensitiveDataHeuristics::SensitiveDataClassification;

/**
 * A data flow source of sensitive data, such as secrets, certificates, or passwords.
 *
 * Extend this class to refine existing API models. If you want to model new APIs,
 * extend `SensitiveDataSource::Range` instead.
 */
class SensitiveDataSource extends DataFlow::Node instanceof SensitiveDataSource::Range {
  SensitiveDataSource() {
    // Ignore sensitive password sources inside `getpass.py` that can escape through the
    // return value of `getpass.getpass()`, since `getpass.getpass()` is considered a
    // source itself.
    not exists(Module getpass |
      getpass.getName() = "getpass" and
      this.getScope().getEnclosingModule() = getpass and
      // Do allow this source if we're analyzing `getpass.py` as part of CPython though
      // (in that case the file has a relative path).
      not exists(getpass.getFile().getRelativePath())
    )
  }

  /**
   * Gets the classification of the sensitive data.
   */
  SensitiveDataClassification getClassification() { result = super.getClassification() }
}

/** Provides a class for modeling new sources of sensitive data, such as secrets, certificates, or passwords. */
module SensitiveDataSource {
  /**
   * A data flow source of sensitive data, such as secrets, certificates, or passwords.
   *
   * Extend this class to model new APIs. If you want to refine existing API models,
   * extend `SensitiveDataSource` instead.
   */
  abstract class Range extends DataFlow::Node {
    /**
     * Gets the classification of the sensitive data.
     */
    abstract SensitiveDataClassification getClassification();
  }
}

/** Actual sensitive data modeling */
private module SensitiveDataModeling {
  private import SensitiveDataHeuristics::HeuristicNames

  /**
   * Gets a reference to a function that is considered to be a sensitive source of
   * `classification`.
   *
   * This is the type-tracking worker: it starts at the definition of any function whose
   * name is a `sensitiveString(classification)`, and follows type-tracking steps from there.
   */
  private DataFlow::TypeTrackingNode sensitiveFunction(
    DataFlow::TypeTracker t, SensitiveDataClassification classification
  ) {
    t.start() and
    exists(Function f |
      f.getName() = sensitiveString(classification) and
      result.asExpr() = f.getDefinition()
    )
    or
    exists(DataFlow::TypeTracker t2 | result = sensitiveFunction(t2, classification).track(t2, t))
  }

  /**
   * Gets a reference to a function that is considered to be a sensitive source of
   * `classification`.
   */
  DataFlow::Node sensitiveFunction(SensitiveDataClassification classification) {
    sensitiveFunction(DataFlow::TypeTracker::end(), classification).flowsTo(result)
  }

  /**
   * Gets a reference (in local scope) to a string constant that, if used as the key in
   * a lookup, indicates the presence of sensitive data with `classification`.
   */
  DataFlow::Node sensitiveLookupStringConst(SensitiveDataClassification classification) {
    // Note: If this is implemented with type-tracking, we will get cross-talk as
    // illustrated in python/ql/test/library-tests/dataflow/sensitive-data/test.py
    exists(DataFlow::LocalSourceNode source |
      source.asExpr().(StringLiteral).getText() = sensitiveString(classification) and
      source.flowsTo(result)
    )
  }

  /** A function call that is considered a source of sensitive data. */
  class SensitiveFunctionCall extends SensitiveDataSource::Range, DataFlow::CallCfgNode {
    SensitiveDataClassification classification;

    SensitiveFunctionCall() {
      this.getFunction() = sensitiveFunction(classification)
      or
      // to cover functions that we don't have the definition for, and where the
      // reference to the function has not already been marked as being sensitive
      this.getFunction().asCfgNode().(NameNode).getId() = sensitiveString(classification)
    }

    override SensitiveDataClassification getClassification() { result = classification }
  }

  /**
   * Tracks any modeled source of sensitive data (with any classification),
   * to limit the scope of `extraStepForCalls`. See its QLDoc for more context.
   *
   * Also see `extraStepForCalls`.
   */
  private DataFlow::TypeTrackingNode possibleSensitiveCallable(DataFlow::TypeTracker t) {
    t.start() and
    result instanceof SensitiveDataSource
    or
    exists(DataFlow::TypeTracker t2 | result = possibleSensitiveCallable(t2).track(t2, t))
  }

  /**
   * Tracks any modeled source of sensitive data (with any classification),
   * to limit the scope of `extraStepForCalls`. See its QLDoc for more context.
   *
   * Also see `extraStepForCalls`.
   */
  private DataFlow::Node possibleSensitiveCallable() {
    possibleSensitiveCallable(DataFlow::TypeTracker::end()).flowsTo(result)
  }

  /**
   * Holds if the step from `nodeFrom` to `nodeTo` should be considered a
   * taint-flow step for sensitive-data, to ensure calls are handled correctly.
   *
   * To handle calls properly, while preserving a good source for path explanations,
   * you need to include this predicate as an additional taint step in your taint-tracking
   * configurations.
   *
   * The core problem can be illustrated by the example below. If we consider the
   * `print` a sink, what path and what source do we want to show? My initial approach
   * would be to use type-tracking to propagate from the `not_found.get_passwd` attribute
   * lookup, to the use of `non_sensitive_name`, and then create a new `SensitiveDataSource::Range`
   * like `SensitiveFunctionCall`. Although that seems likely to work, it will also end up
   * with a non-optimal path, which starts at _bad source_, and therefore doesn't show
   * how we figured out that `non_sensitive_name`
   * could be a function that returns a password (and in cases where there are many calls to
   * `my_func` it will be annoying for someone to figure this out manually).
   *
   * By including this additional taint-step in the taint-tracking configuration, it's possible
   * to get a path explanation going from _good source_ to the sink.
   *
   * ```python
   * def my_func(non_sensitive_name):
   *     x = non_sensitive_name() # <-- bad source
   *     print(x) # <-- sink
   *
   * import not_found
   * f = not_found.get_passwd # <-- good source
   * my_func(f)
   * ```
   */
  predicate extraStepForCalls(DataFlow::Node nodeFrom, DataFlow::CallCfgNode nodeTo) {
    // However, we do still use the type-tracking approach to limit the size of this
    // predicate.
    nodeTo.getFunction() = nodeFrom and
    nodeFrom = possibleSensitiveCallable()
  }

  /**
   * Gets the text of a string literal that is not a docstring, and that does not match
   * `notSensitiveRegexp()`.
   */
  pragma[nomagic]
  private string sensitiveStringLiteralCandidate() {
    result = any(StringLiteral s | not s.isDocString()).getText() and
    not result.regexpMatch(notSensitiveRegexp())
  }

  /** Gets the attribute name of an attribute read, if it does not match `notSensitiveRegexp()`. */
  pragma[nomagic]
  private string sensitiveAttributeNameCandidate() {
    result = any(DataFlow::AttrRead a).getAttributeName() and
    not result.regexpMatch(notSensitiveRegexp())
  }

  /** Gets the name of a parameter, if it does not match `notSensitiveRegexp()`. */
  pragma[nomagic]
  private string sensitiveParameterNameCandidate() {
    result = any(Parameter p).getName() and
    not result.regexpMatch(notSensitiveRegexp())
  }

  /** Gets the name of a function, if it does not match `notSensitiveRegexp()`. */
  pragma[nomagic]
  private string sensitiveFunctionNameCandidate() {
    result = any(Function f).getName() and
    not result.regexpMatch(notSensitiveRegexp())
  }

  /** Gets the identifier of a `Name`, if it does not match `notSensitiveRegexp()`. */
  pragma[nomagic]
  private string sensitiveNameCandidate() {
    result = any(Name n).getId() and
    not result.regexpMatch(notSensitiveRegexp())
  }

  /**
   * This helper predicate serves to deduplicate the results of the preceding predicates. This
   * means that if, say, an attribute and a function parameter have the same name, then that name will
   * only be matched once, which greatly cuts down on the number of regexp matches that have to be
   * performed.
   *
   * Under normal circumstances, deduplication is only performed when a predicate is materialized, and
   * so to see the effect of this we must create a separate predicate that calculates the union of the
   * preceding predicates.
   */
  pragma[nomagic]
  private string sensitiveStringCandidate() {
    result in [
        sensitiveNameCandidate(), sensitiveAttributeNameCandidate(),
        sensitiveParameterNameCandidate(), sensitiveFunctionNameCandidate(),
        sensitiveStringLiteralCandidate()
      ]
  }

  /**
   * Returns strings (primarily the names of various program entities) that may contain sensitive data
   * with the classification `classification`.
   *
   * This helper predicate ends up being very similar to `nameIndicatesSensitiveData`,
   * but is performance optimized to limit the number of regexp matches that have to be performed.
   */
  pragma[nomagic]
  private string sensitiveString(SensitiveDataClassification classification) {
    result = sensitiveStringCandidate() and
    result.regexpMatch(maybeSensitiveRegexp(classification))
  }

  /**
   * A variable assignment (also including with/for) where the name indicates
   * it contains sensitive data.
   *
   * Note: We _could_ make any access to a variable with a sensitive name a source of
   * sensitive data, but to make path explanations in data-flow/taint-tracking good,
   * we don't want that, since it works against allowing users to understand the flow
   * in the program (which is the whole point).
   *
   * Note: To make data-flow/taint-tracking work, the expression that is _assigned_ to
   * the variable is marked as the source (as compared to marking the variable as the
   * source).
   */
  class SensitiveVariableAssignment extends SensitiveDataSource::Range {
    SensitiveDataClassification classification;

    SensitiveVariableAssignment() {
      exists(DefinitionNode def |
        def.(NameNode).getId() = sensitiveString(classification) and
        (
          this.asCfgNode() = def.getValue()
          or
          // for-loop targets: the iterated sequence is marked as the source
          this.asCfgNode() = def.getValue().(ForNode).getSequence()
        ) and
        // function and class definitions bound to a sensitive name are not sources
        not this.asExpr() instanceof FunctionExpr and
        not this.asExpr() instanceof ClassExpr
      )
      or
      // `with CONTEXT_EXPR as SENSITIVE_NAME:` -- the context expression is the source
      exists(With with |
        with.getOptionalVars().(Name).getId() = sensitiveString(classification) and
        this.asExpr() = with.getContextExpr()
      )
    }

    override SensitiveDataClassification getClassification() { result = classification }
  }

  /** An attribute access that is considered a source of sensitive data. */
  class SensitiveAttributeAccess extends SensitiveDataSource::Range {
    SensitiveDataClassification classification;

    SensitiveAttributeAccess() {
      // Things like `foo.SENSITIVE_NAME` or `from MODULE import SENSITIVE_NAME`
      // I considered excluding any `from ... import something_sensitive`, but then realized that
      // we should flag up `from ... import password as ...` as a password
      this.(DataFlow::AttrRead).getAttributeName() = sensitiveString(classification)
      or
      // Things like `getattr(foo, REFERENCE_TO_SENSITIVE_STRING)`
      this.(DataFlow::AttrRead).getAttributeNameExpr() = sensitiveLookupStringConst(classification)
    }

    override SensitiveDataClassification getClassification() { result = classification }
  }

  /** A subscript, where the key indicates the result will be sensitive data. */
  class SensitiveSubscript extends SensitiveDataSource::Range {
    SensitiveDataClassification classification;

    SensitiveSubscript() {
      this.asCfgNode().(SubscriptNode).getIndex() =
        sensitiveLookupStringConst(classification).asCfgNode()
    }

    override SensitiveDataClassification getClassification() { result = classification }
  }

  /** A call to `get` on an object, where the key indicates the result will be sensitive data. */
  class SensitiveGetCall extends SensitiveDataSource::Range, DataFlow::CallCfgNode {
    SensitiveDataClassification classification;

    SensitiveGetCall() {
      this.getFunction().(DataFlow::AttrRef).getAttributeName() = "get" and
      this.getArg(0) = sensitiveLookupStringConst(classification)
    }

    override SensitiveDataClassification getClassification() { result = classification }
  }

  /** A parameter where the name indicates it will receive sensitive data. */
  class SensitiveParameter extends SensitiveDataSource::Range, DataFlow::ParameterNode {
    SensitiveDataClassification classification;

    SensitiveParameter() { this.getParameter().getName() = sensitiveString(classification) }

    override SensitiveDataClassification getClassification() { result = classification }
  }

  /**
   * A call to `getpass.getpass`, see https://docs.python.org/3.10/library/getpass.html#getpass.getpass
   */
  class GetPassCall extends SensitiveDataSource::Range, API::CallNode {
    GetPassCall() { this = API::moduleImport("getpass").getMember("getpass").getACall() }

    override SensitiveDataClassification getClassification() {
      result = SensitiveDataClassification::password()
    }
  }
}

/**
 * Alias of `SensitiveDataModeling::extraStepForCalls`, exported under a top-level name
 * since the `SensitiveDataModeling` module itself is private.
 */
predicate sensitiveDataExtraStepForCalls = SensitiveDataModeling::extraStepForCalls/2;

/**
 * Alias of `SensitiveDataModeling::sensitiveLookupStringConst`, exported under a top-level
 * name since the `SensitiveDataModeling` module itself is private.
 */
predicate sensitiveLookupStringConst = SensitiveDataModeling::sensitiveLookupStringConst/1;