From 80bf38d8ccdbf079f0ca4bc79293da44fcc67286 Mon Sep 17 00:00:00 2001 From: Benjamin Rodes Date: Tue, 6 Feb 2024 14:20:01 -0500 Subject: [PATCH] Initial working draft of non-const source refactor. --- .../Likely Bugs/Format/NonConstantFormat.ql | 143 +++++++----------- .../NonConstantFormat/NonConstantFormat.c | 9 +- .../Format/NonConstantFormat/nested.cpp | 4 +- .../Format/NonConstantFormat/test.cpp | 124 ++++++++++++--- 4 files changed, 164 insertions(+), 116 deletions(-) diff --git a/cpp/ql/src/Likely Bugs/Format/NonConstantFormat.ql b/cpp/ql/src/Likely Bugs/Format/NonConstantFormat.ql index 5fd1c861f98..434cd85c655 100644 --- a/cpp/ql/src/Likely Bugs/Format/NonConstantFormat.ql +++ b/cpp/ql/src/Likely Bugs/Format/NonConstantFormat.ql @@ -18,115 +18,80 @@ import semmle.code.cpp.ir.dataflow.TaintTracking import semmle.code.cpp.commons.Printf import semmle.code.cpp.security.FlowSources +import semmle.code.cpp.ir.dataflow.internal.ModelUtil +import semmle.code.cpp.models.interfaces.DataFlow +import semmle.code.cpp.models.interfaces.Taint +import semmle.code.cpp.ir.implementation.raw.Instruction class UncalledFunction extends Function { - UncalledFunction() { not exists(Call c | c.getTarget() = this) } + UncalledFunction() { + not exists(Call c | c.getTarget() = this) and + not this.(MemberFunction).overrides(_) + } } -// For the following `...gettext` functions, we assume that -// all translations preserve the type and order of `%` specifiers -// (and hence are safe to use as format strings). This -// assumption is hard-coded into the query. -predicate whitelistFunction(Function f, int arg) { - // basic variations of gettext - f.getName() = "_" and arg = 0 +/** + * Holds if `node` is a non-constant source of data flow. + * This is defined as either: + * 1) a `FlowSource` + * 2) a parameter of an 'uncalled' function (i.e., a function that is not called in the program) + * 3) an argument to a function with no definition that is not known to define the output through its input + * 4) an out arg of a function with no definition that is not known to define the output through its input + * + * The latter two cases address identifying standard string manipulation libraries as input sources + * e.g., strcpy, but it will identify unknown function calls as possible non-constant source + * since it cannot be determined if the out argument or return is constant. + */ +predicate isNonConst(DataFlow::Node node) { + node instanceof FlowSource or - f.getName() = "gettext" and arg = 0 + exists(UncalledFunction f | f.getAParameter() = node.asParameter()) or - f.getName() = "dgettext" and arg = 1 - or - f.getName() = "dcgettext" and arg = 1 - or - // plural variations of gettext that take one format string for singular and another for plural form - f.getName() = "ngettext" and - (arg = 0 or arg = 1) - or - f.getName() = "dngettext" and - (arg = 1 or arg = 2) - or - f.getName() = "dcngettext" and - (arg = 1 or arg = 2) -} - -// we assume that ALL uses of the `_` macro -// return constant string literals -predicate underscoreMacro(Expr e) { - exists(MacroInvocation mi | - mi.getMacroName() = "_" and - mi.getExpr() = e + // Consider as an input any out arg of a function or a function's return where the function is not: + // 1. a function with a known dataflow or taintflow from input to output and the `node` is the output + // 2. a function where there is a known definition + // i.e., functions that with unknown bodies and are not known to define the output through its input + // are considered as possible non-const sources + ( + node instanceof DataFlow::DefinitionByReferenceNode or + node.asIndirectExpr() instanceof Call + ) and + not exists(Function func, FunctionInput input, FunctionOutput output, CallInstruction call | + // NOTE: we must include dataflow and taintflow. e.g., including only dataflow we will find sprintf + // variant function's output are now possible non-const sources + ( + func.(DataFlowFunction).hasDataFlow(input, output) or + func.(TaintFunction).hasTaintFlow(input, output) + ) and + node = callOutput(call, output) and + call.getStaticCallTarget() = func + ) and + not exists(Call c | + c.getTarget().hasDefinition() and + if node instanceof DataFlow::DefinitionByReferenceNode + then c.getAnArgument() = node.asDefiningArgument() + else c = [node.asExpr(), node.asIndirectExpr()] ) } /** - * Holds if `t` cannot hold a character array, directly or indirectly. + * Holds if `sink` is a sink is a format string of any + * `FormattingFunctionCall`. */ -predicate cannotContainString(Type t, boolean isIndirect) { - isIndirect = false and - exists(Type unspecified | - unspecified = t.getUnspecifiedType() and - not unspecified instanceof UnknownType - | - unspecified instanceof BuiltInType or - unspecified instanceof IntegralOrEnumType - ) -} - -predicate isNonConst(DataFlow::Node node, boolean isIndirect) { - exists(Expr e | - e = node.asExpr() and isIndirect = false - or - e = node.asIndirectExpr() and isIndirect = true - | - exists(FunctionCall fc | fc = e | - not ( - whitelistFunction(fc.getTarget(), _) or - fc.getTarget().hasDefinition() - ) - ) - or - exists(Variable v | v = e.(VariableAccess).getTarget() | - v.getType().(ArrayType).getBaseType() instanceof CharType and - exists(AssignExpr ae | - ae.getLValue().(ArrayExpr).getArrayBase().(VariableAccess).getTarget() = v - ) - ) - or - exists(UncalledFunction f, Parameter p | f.getAParameter() = p | - p = e.(VariableAccess).getTarget() - ) - or - node instanceof FlowSource - or - node instanceof DataFlow::DefinitionByReferenceNode and - not exists(FormattingFunctionCall fc | node.asDefiningArgument() = fc.getOutputArgument(_)) and - not exists(Call c | - c.getAnArgument() = node.asDefiningArgument() and c.getTarget().hasDefinition() - ) - ) - or - node instanceof DataFlow::DefinitionByReferenceNode and - isIndirect = true -} - -pragma[noinline] -predicate isBarrierNode(DataFlow::Node node) { - underscoreMacro([node.asExpr(), node.asIndirectExpr()]) - or - exists(node.asExpr()) and - cannotContainString(node.getType(), false) -} - predicate isSinkImpl(DataFlow::Node sink, Expr formatString) { [sink.asExpr(), sink.asIndirectExpr()] = formatString and exists(FormattingFunctionCall fc | formatString = fc.getArgument(fc.getFormatParameterIndex())) } module NonConstFlowConfig implements DataFlow::ConfigSig { - predicate isSource(DataFlow::Node source) { isNonConst(source, _) } + predicate isSource(DataFlow::Node source) { isNonConst(source) } predicate isSink(DataFlow::Node sink) { isSinkImpl(sink, _) } - predicate isBarrier(DataFlow::Node node) { isBarrierNode(node) } + predicate isBarrier(DataFlow::Node node) { + // Ignore tracing non-const through array indices + exists(ArrayExpr a | a.getArrayOffset() = node.asExpr()) + } } module NonConstFlow = TaintTracking::Global; diff --git a/cpp/ql/test/query-tests/Likely Bugs/Format/NonConstantFormat/NonConstantFormat.c b/cpp/ql/test/query-tests/Likely Bugs/Format/NonConstantFormat/NonConstantFormat.c index c6e3ac55356..d7b60aebe88 100644 --- a/cpp/ql/test/query-tests/Likely Bugs/Format/NonConstantFormat/NonConstantFormat.c +++ b/cpp/ql/test/query-tests/Likely Bugs/Format/NonConstantFormat/NonConstantFormat.c @@ -23,7 +23,7 @@ extern char *dcngettext (const char *__domainname, const char *__msgid1, extern char *any_random_function(const char *); #define NULL ((void*)0) -#define _(X) any_random_function((X)) +#define _(X) gettext(X) int main(int argc, char **argv) { if(argc > 1) @@ -40,10 +40,9 @@ int main(int argc, char **argv) { printf(gettext("%d arguments\n"), argc-1); // GOOD printf(any_random_function("%d arguments\n"), argc-1); // BAD - // Even though `_` is mapped to `some_random_function` above, - // the following call should not be flagged. - printf(_(any_random_function("%d arguments\n")), - argc-1); // GOOD + + + printf(_(any_random_function("%d arguments\n")), argc-1); // BAD return 0; } diff --git a/cpp/ql/test/query-tests/Likely Bugs/Format/NonConstantFormat/nested.cpp b/cpp/ql/test/query-tests/Likely Bugs/Format/NonConstantFormat/nested.cpp index de9daae32e2..c81f9af4669 100644 --- a/cpp/ql/test/query-tests/Likely Bugs/Format/NonConstantFormat/nested.cpp +++ b/cpp/ql/test/query-tests/Likely Bugs/Format/NonConstantFormat/nested.cpp @@ -18,7 +18,7 @@ extern "C" int snprintf ( char * s, int n, const char * format, ... ); struct A { void do_print(const char *fmt0) { char buf[32]; - snprintf(buf, 32, fmt0); // GOOD [FALSE POSITIVE] + snprintf(buf, 32, fmt0); // BAD through call from c.do_some_printing(c.ext_fmt_str()) } }; @@ -39,7 +39,7 @@ struct C { void foo(void) { C c; - c.do_some_printing(c.ext_fmt_str()); // BAD [NOT DETECTED] + c.do_some_printing(c.ext_fmt_str()); } struct some_class { diff --git a/cpp/ql/test/query-tests/Likely Bugs/Format/NonConstantFormat/test.cpp b/cpp/ql/test/query-tests/Likely Bugs/Format/NonConstantFormat/test.cpp index 1fadbf20e45..caf44b8ec39 100644 --- a/cpp/ql/test/query-tests/Likely Bugs/Format/NonConstantFormat/test.cpp +++ b/cpp/ql/test/query-tests/Likely Bugs/Format/NonConstantFormat/test.cpp @@ -54,66 +54,66 @@ int main(int argc, char **argv) { { char hello[] = "hello, World\n"; hello[0] = 'H'; - printf(hello); // BAD + printf(hello); // GOOD printf(_(hello)); // GOOD printf(gettext(hello)); // GOOD - printf(const_wash(hello)); // BAD - printf((hello + 1) + 1); // BAD - printf(+hello); // BAD - printf(*&hello); // BAD - printf(&*hello); // BAD - printf((char*)(void*)+(hello+1) + 1); // BAD + printf(const_wash(hello)); // GOOD + printf((hello + 1) + 1); // GOOD + printf(+hello); // GOOD + printf(*&hello); // GOOD + printf(&*hello); // GOOD + printf((char*)(void*)+(hello+1) + 1); // GOOD } - printf(("Hello, World\n" + 1) + 1); // BAD + printf(("Hello, World\n" + 1) + 1); // GOOD { const char *hello = "Hello, World\n"; - printf(hello + 1); // BAD + printf(hello + 1); // GOOD printf(hello); // GOOD } { const char *hello = "Hello, World\n"; hello += 1; - printf(hello); // BAD + printf(hello); // GOOD } { // Same as above block but using "x = x + 1" syntax const char *hello = "Hello, World\n"; hello = hello + 1; - printf(hello); // BAD + printf(hello); // GOOD } { // Same as above block but using "x++" syntax const char *hello = "Hello, World\n"; hello++; - printf(hello); // BAD + printf(hello); // GOOD } { // Same as above block but using "++x" as subexpression const char *hello = "Hello, World\n"; - printf(++hello); // BAD + printf(++hello); // GOOD } { // Same as above block but through a pointer const char *hello = "Hello, World\n"; const char **p = &hello; (*p)++; - printf(hello); // BAD + printf(hello); // GOOD } { // Same as above block but through a C++ reference const char *hello = "Hello, World\n"; const char *&p = hello; p++; - printf(hello); // BAD [NOT DETECTED] + printf(hello); // GOOD } if (gettext_debug) { - printf(new char[100]); // BAD + printf(new char[100]); // BAD [FALSE NEGATIVE] } { const char *hello = "Hello, World\n"; - const char *const *p = &hello; // harmless reference to const pointer - printf(hello); // GOOD [FALSE POSITIVE] - hello++; // modification comes after use and so does no harm + const char *const *p = &hello; + printf(hello); // GOOD + hello++; } printf(argc > 2 ? "More than one\n" : _("Only one\n")); // GOOD @@ -154,7 +154,7 @@ void print_ith_message() { void fmt_via_strcpy(char *data) { strcpy(data, "some string"); - printf(data); // BAD + printf(data); // GOOD [FALSE POSITIVE: Due to inaccurate dataflow killers] } void fmt_with_assignment() { @@ -163,3 +163,87 @@ void fmt_with_assignment() { x = y = "a"; printf(y); // GOOD } + +void fmt_via_strcpy_bad(char *data) { + char res[100]; + strcpy(res, data); + printf(res); // BAD +} + + +int wprintf(const wchar_t *format,...); +typedef wchar_t *STRSAFE_LPWSTR; +typedef const wchar_t *STRSAFE_LPCWSTR; +typedef unsigned int size_t; + +void StringCchPrintfW( + STRSAFE_LPWSTR pszDest, + size_t cchDest, + STRSAFE_LPCWSTR pszFormat, + ... +); + +void wchar_t_test_good(){ + wchar_t wstr[100]; + StringCchPrintfW(wstr, 100, L"STRING"); // GOOD + + wprintf(wstr); // GOOD +} + +void wchar_t_test_bad(wchar_t* str){ + wchar_t wstr[100]; + StringCchPrintfW(wstr, 100, str); // BAD + + wprintf(wstr); // BAD +} + +const char* get_string(); + +void pointer_arithmetic_test_on_bad_string(){ + { + const char *hello = get_string(); + printf(hello + 1); // BAD + printf(hello); // BAD + } + { + const char *hello = get_string(); + hello += 1; + printf(hello); // BAD + } + { + // Same as above block but using "x = x + 1" syntax + const char *hello = get_string(); + hello = hello + 1; + printf(hello); // BAD + } + { + // Same as above block but using "x++" syntax + const char *hello = get_string(); + hello++; + printf(hello); // BAD + } + { + // Same as above block but using "++x" as subexpression + const char *hello = get_string(); + printf(++hello); // BAD + } + { + // Same as above block but through a pointer + const char *hello = get_string(); + const char **p = &hello; + (*p)++; + printf(hello); // BAD + } + { + // Same as above block but through a C++ reference + const char *hello = get_string(); + const char *&p = hello; + p++; + printf(hello); // BAD + } + { + const char *hello = get_string(); + const char *const *p = &hello; + printf(hello); // BAD + } +} \ No newline at end of file