Initial working draft of non-const source refactor.

This commit is contained in:
Benjamin Rodes
2024-02-06 14:20:01 -05:00
parent 5e5fea8ef3
commit 80bf38d8cc
4 changed files with 164 additions and 116 deletions

View File

@@ -18,115 +18,80 @@
import semmle.code.cpp.ir.dataflow.TaintTracking
import semmle.code.cpp.commons.Printf
import semmle.code.cpp.security.FlowSources
import semmle.code.cpp.ir.dataflow.internal.ModelUtil
import semmle.code.cpp.models.interfaces.DataFlow
import semmle.code.cpp.models.interfaces.Taint
import semmle.code.cpp.ir.implementation.raw.Instruction
class UncalledFunction extends Function {
UncalledFunction() { not exists(Call c | c.getTarget() = this) }
UncalledFunction() {
not exists(Call c | c.getTarget() = this) and
not this.(MemberFunction).overrides(_)
}
}
// For the following `...gettext` functions, we assume that
// all translations preserve the type and order of `%` specifiers
// (and hence are safe to use as format strings). This
// assumption is hard-coded into the query.
predicate whitelistFunction(Function f, int arg) {
// basic variations of gettext
f.getName() = "_" and arg = 0
/**
* Holds if `node` is a non-constant source of data flow.
* This is defined as either:
* 1) a `FlowSource`
* 2) a parameter of an 'uncalled' function (i.e., a function that is not called in the program)
* 3) an argument to a function with no definition that is not known to define the output through its input
* 4) an out arg of a function with no definition that is not known to define the output through its input
*
* The latter two cases address identifying standard string manipulation libraries as input sources
* e.g., strcpy, but it will identify unknown function calls as possible non-constant source
* since it cannot be determined if the out argument or return is constant.
*/
predicate isNonConst(DataFlow::Node node) {
node instanceof FlowSource
or
f.getName() = "gettext" and arg = 0
exists(UncalledFunction f | f.getAParameter() = node.asParameter())
or
f.getName() = "dgettext" and arg = 1
or
f.getName() = "dcgettext" and arg = 1
or
// plural variations of gettext that take one format string for singular and another for plural form
f.getName() = "ngettext" and
(arg = 0 or arg = 1)
or
f.getName() = "dngettext" and
(arg = 1 or arg = 2)
or
f.getName() = "dcngettext" and
(arg = 1 or arg = 2)
}
// we assume that ALL uses of the `_` macro
// return constant string literals
predicate underscoreMacro(Expr e) {
exists(MacroInvocation mi |
mi.getMacroName() = "_" and
mi.getExpr() = e
// Consider as an input any out arg of a function or a function's return where the function is not:
// 1. a function with a known dataflow or taintflow from input to output and the `node` is the output
// 2. a function where there is a known definition
// i.e., functions that with unknown bodies and are not known to define the output through its input
// are considered as possible non-const sources
(
node instanceof DataFlow::DefinitionByReferenceNode or
node.asIndirectExpr() instanceof Call
) and
not exists(Function func, FunctionInput input, FunctionOutput output, CallInstruction call |
// NOTE: we must include dataflow and taintflow. e.g., including only dataflow we will find sprintf
// variant function's output are now possible non-const sources
(
func.(DataFlowFunction).hasDataFlow(input, output) or
func.(TaintFunction).hasTaintFlow(input, output)
) and
node = callOutput(call, output) and
call.getStaticCallTarget() = func
) and
not exists(Call c |
c.getTarget().hasDefinition() and
if node instanceof DataFlow::DefinitionByReferenceNode
then c.getAnArgument() = node.asDefiningArgument()
else c = [node.asExpr(), node.asIndirectExpr()]
)
}
/**
* Holds if `t` cannot hold a character array, directly or indirectly.
* Holds if `sink` is a sink is a format string of any
* `FormattingFunctionCall`.
*/
predicate cannotContainString(Type t, boolean isIndirect) {
isIndirect = false and
exists(Type unspecified |
unspecified = t.getUnspecifiedType() and
not unspecified instanceof UnknownType
|
unspecified instanceof BuiltInType or
unspecified instanceof IntegralOrEnumType
)
}
predicate isNonConst(DataFlow::Node node, boolean isIndirect) {
exists(Expr e |
e = node.asExpr() and isIndirect = false
or
e = node.asIndirectExpr() and isIndirect = true
|
exists(FunctionCall fc | fc = e |
not (
whitelistFunction(fc.getTarget(), _) or
fc.getTarget().hasDefinition()
)
)
or
exists(Variable v | v = e.(VariableAccess).getTarget() |
v.getType().(ArrayType).getBaseType() instanceof CharType and
exists(AssignExpr ae |
ae.getLValue().(ArrayExpr).getArrayBase().(VariableAccess).getTarget() = v
)
)
or
exists(UncalledFunction f, Parameter p | f.getAParameter() = p |
p = e.(VariableAccess).getTarget()
)
or
node instanceof FlowSource
or
node instanceof DataFlow::DefinitionByReferenceNode and
not exists(FormattingFunctionCall fc | node.asDefiningArgument() = fc.getOutputArgument(_)) and
not exists(Call c |
c.getAnArgument() = node.asDefiningArgument() and c.getTarget().hasDefinition()
)
)
or
node instanceof DataFlow::DefinitionByReferenceNode and
isIndirect = true
}
pragma[noinline]
predicate isBarrierNode(DataFlow::Node node) {
underscoreMacro([node.asExpr(), node.asIndirectExpr()])
or
exists(node.asExpr()) and
cannotContainString(node.getType(), false)
}
predicate isSinkImpl(DataFlow::Node sink, Expr formatString) {
[sink.asExpr(), sink.asIndirectExpr()] = formatString and
exists(FormattingFunctionCall fc | formatString = fc.getArgument(fc.getFormatParameterIndex()))
}
module NonConstFlowConfig implements DataFlow::ConfigSig {
predicate isSource(DataFlow::Node source) { isNonConst(source, _) }
predicate isSource(DataFlow::Node source) { isNonConst(source) }
predicate isSink(DataFlow::Node sink) { isSinkImpl(sink, _) }
predicate isBarrier(DataFlow::Node node) { isBarrierNode(node) }
predicate isBarrier(DataFlow::Node node) {
// Ignore tracing non-const through array indices
exists(ArrayExpr a | a.getArrayOffset() = node.asExpr())
}
}
module NonConstFlow = TaintTracking::Global<NonConstFlowConfig>;

View File

@@ -23,7 +23,7 @@ extern char *dcngettext (const char *__domainname, const char *__msgid1,
extern char *any_random_function(const char *);
#define NULL ((void*)0)
#define _(X) any_random_function((X))
#define _(X) gettext(X)
int main(int argc, char **argv) {
if(argc > 1)
@@ -40,10 +40,9 @@ int main(int argc, char **argv) {
printf(gettext("%d arguments\n"), argc-1); // GOOD
printf(any_random_function("%d arguments\n"), argc-1); // BAD
// Even though `_` is mapped to `some_random_function` above,
// the following call should not be flagged.
printf(_(any_random_function("%d arguments\n")),
argc-1); // GOOD
printf(_(any_random_function("%d arguments\n")), argc-1); // BAD
return 0;
}

View File

@@ -18,7 +18,7 @@ extern "C" int snprintf ( char * s, int n, const char * format, ... );
struct A {
void do_print(const char *fmt0) {
char buf[32];
snprintf(buf, 32, fmt0); // GOOD [FALSE POSITIVE]
snprintf(buf, 32, fmt0); // BAD through call from c.do_some_printing(c.ext_fmt_str())
}
};
@@ -39,7 +39,7 @@ struct C {
void foo(void) {
C c;
c.do_some_printing(c.ext_fmt_str()); // BAD [NOT DETECTED]
c.do_some_printing(c.ext_fmt_str());
}
struct some_class {

View File

@@ -54,66 +54,66 @@ int main(int argc, char **argv) {
{
char hello[] = "hello, World\n";
hello[0] = 'H';
printf(hello); // BAD
printf(hello); // GOOD
printf(_(hello)); // GOOD
printf(gettext(hello)); // GOOD
printf(const_wash(hello)); // BAD
printf((hello + 1) + 1); // BAD
printf(+hello); // BAD
printf(*&hello); // BAD
printf(&*hello); // BAD
printf((char*)(void*)+(hello+1) + 1); // BAD
printf(const_wash(hello)); // GOOD
printf((hello + 1) + 1); // GOOD
printf(+hello); // GOOD
printf(*&hello); // GOOD
printf(&*hello); // GOOD
printf((char*)(void*)+(hello+1) + 1); // GOOD
}
printf(("Hello, World\n" + 1) + 1); // BAD
printf(("Hello, World\n" + 1) + 1); // GOOD
{
const char *hello = "Hello, World\n";
printf(hello + 1); // BAD
printf(hello + 1); // GOOD
printf(hello); // GOOD
}
{
const char *hello = "Hello, World\n";
hello += 1;
printf(hello); // BAD
printf(hello); // GOOD
}
{
// Same as above block but using "x = x + 1" syntax
const char *hello = "Hello, World\n";
hello = hello + 1;
printf(hello); // BAD
printf(hello); // GOOD
}
{
// Same as above block but using "x++" syntax
const char *hello = "Hello, World\n";
hello++;
printf(hello); // BAD
printf(hello); // GOOD
}
{
// Same as above block but using "++x" as subexpression
const char *hello = "Hello, World\n";
printf(++hello); // BAD
printf(++hello); // GOOD
}
{
// Same as above block but through a pointer
const char *hello = "Hello, World\n";
const char **p = &hello;
(*p)++;
printf(hello); // BAD
printf(hello); // GOOD
}
{
// Same as above block but through a C++ reference
const char *hello = "Hello, World\n";
const char *&p = hello;
p++;
printf(hello); // BAD [NOT DETECTED]
printf(hello); // GOOD
}
if (gettext_debug) {
printf(new char[100]); // BAD
printf(new char[100]); // BAD [FALSE NEGATIVE]
}
{
const char *hello = "Hello, World\n";
const char *const *p = &hello; // harmless reference to const pointer
printf(hello); // GOOD [FALSE POSITIVE]
hello++; // modification comes after use and so does no harm
const char *const *p = &hello;
printf(hello); // GOOD
hello++;
}
printf(argc > 2 ? "More than one\n" : _("Only one\n")); // GOOD
@@ -154,7 +154,7 @@ void print_ith_message() {
void fmt_via_strcpy(char *data) {
strcpy(data, "some string");
printf(data); // BAD
printf(data); // GOOD [FALSE POSITIVE: Due to inaccurate dataflow killers]
}
void fmt_with_assignment() {
@@ -163,3 +163,87 @@ void fmt_with_assignment() {
x = y = "a";
printf(y); // GOOD
}
void fmt_via_strcpy_bad(char *data) {
char res[100];
strcpy(res, data);
printf(res); // BAD
}
int wprintf(const wchar_t *format,...);
typedef wchar_t *STRSAFE_LPWSTR;
typedef const wchar_t *STRSAFE_LPCWSTR;
typedef unsigned int size_t;
void StringCchPrintfW(
STRSAFE_LPWSTR pszDest,
size_t cchDest,
STRSAFE_LPCWSTR pszFormat,
...
);
void wchar_t_test_good(){
wchar_t wstr[100];
StringCchPrintfW(wstr, 100, L"STRING"); // GOOD
wprintf(wstr); // GOOD
}
void wchar_t_test_bad(wchar_t* str){
wchar_t wstr[100];
StringCchPrintfW(wstr, 100, str); // BAD
wprintf(wstr); // BAD
}
const char* get_string();
void pointer_arithmetic_test_on_bad_string(){
{
const char *hello = get_string();
printf(hello + 1); // BAD
printf(hello); // BAD
}
{
const char *hello = get_string();
hello += 1;
printf(hello); // BAD
}
{
// Same as above block but using "x = x + 1" syntax
const char *hello = get_string();
hello = hello + 1;
printf(hello); // BAD
}
{
// Same as above block but using "x++" syntax
const char *hello = get_string();
hello++;
printf(hello); // BAD
}
{
// Same as above block but using "++x" as subexpression
const char *hello = get_string();
printf(++hello); // BAD
}
{
// Same as above block but through a pointer
const char *hello = get_string();
const char **p = &hello;
(*p)++;
printf(hello); // BAD
}
{
// Same as above block but through a C++ reference
const char *hello = get_string();
const char *&p = hello;
p++;
printf(hello); // BAD
}
{
const char *hello = get_string();
const char *const *p = &hello;
printf(hello); // BAD
}
}