From d2c98c86dcb4aecefe67f6767f78ea9ede629d4b Mon Sep 17 00:00:00 2001 From: Michael Nebel Date: Tue, 3 Sep 2024 10:57:00 +0200 Subject: [PATCH] Java: Improve content based model generation. --- .../modelgenerator/internal/CaptureModels.qll | 247 +++++++++++++++++- .../internal/CaptureModelsSpecific.qll | 33 ++- .../CaptureTypeBasedSummaryModels.qll | 2 +- .../mad/modelgenerator/ModelPrinting.qll | 43 +-- 4 files changed, 295 insertions(+), 30 deletions(-) diff --git a/java/ql/src/utils/modelgenerator/internal/CaptureModels.qll b/java/ql/src/utils/modelgenerator/internal/CaptureModels.qll index 0f24bab005e..9373382941a 100644 --- a/java/ql/src/utils/modelgenerator/internal/CaptureModels.qll +++ b/java/ql/src/utils/modelgenerator/internal/CaptureModels.qll @@ -127,7 +127,7 @@ string captureQualifierFlow(DataFlowSummaryTargetApi api) { api = returnNodeEnclosingCallable(ret) and isOwnInstanceAccessNode(ret) ) and - result = Printing::asValueModel(api, qualifierString(), "ReturnValue") + result = Printing::asLiftedValueModel(api, qualifierString(), "ReturnValue") } private int accessPathLimit0() { result = 2 } @@ -237,7 +237,7 @@ string captureThroughFlow0( input = parameterNodeAsInput(p) and output = getOutput(returnNodeExt) and input != output and - result = Printing::asTaintModel(api, input, output) + result = Printing::asLiftedTaintModel(api, input, output) ) } @@ -291,26 +291,259 @@ private string getContent(PropagateContentFlow::AccessPath ap, int i) { ) } +/** + * Gets the MaD string representation of a store step access path. + */ private string printStoreAccessPath(PropagateContentFlow::AccessPath ap) { result = concat(int i | | getContent(ap, i), "" order by i) } +/** + * Gets the MaD string representation of a read step access path. 
+ */ private string printReadAccessPath(PropagateContentFlow::AccessPath ap) { result = concat(int i | | getContent(ap, i), "" order by i desc) } -string captureContentFlow(DataFlowSummaryTargetApi api) { +/** + * Holds if the access path `ap` contains a field or synthetic field access. + */ +private predicate mentionsField(PropagateContentFlow::AccessPath ap) { + exists(ContentSet head, PropagateContentFlow::AccessPath tail | + head = ap.getHead() and + tail = ap.getTail() and + (mentionsField(tail) or isField(head)) + ) +} + +private predicate apiFlow( + DataFlowSummaryTargetApi api, DataFlow::ParameterNode p, PropagateContentFlow::AccessPath reads, + ReturnNodeExt returnNodeExt, PropagateContentFlow::AccessPath stores, boolean preservesValue +) { + PropagateContentFlow::flow(p, reads, returnNodeExt, stores, preservesValue) and + returnNodeExt.getEnclosingCallable() = api and + p.getEnclosingCallable() = api +} + +/** + * A class of APIs relevant for modeling using content flow. + * The following heuristic is applied: + * Content flow is only relevant for an API, if + * #content flow <= 2 * #parameters + 3 + * If an API produces more content flow, it is likely that + * 1. Types are not sufficiently constrained, leading to a combinatorial + * explosion in dispatch and thus in the generated summaries. + * 2. It is a reasonable approximation to use the non-content based flow + * detection instead, as reads and stores would use a significant + * part of an object's internal state. 
+ */ +private class ContentDataFlowSummaryTargetApi extends DataFlowSummaryTargetApi { + ContentDataFlowSummaryTargetApi() { + count(string input, string output | + exists( + DataFlow::ParameterNode p, PropagateContentFlow::AccessPath reads, + ReturnNodeExt returnNodeExt, PropagateContentFlow::AccessPath stores + | + apiFlow(this, p, reads, returnNodeExt, stores, _) and + input = parameterNodeAsContentInput(p) + printReadAccessPath(reads) and + output = getContentOutput(returnNodeExt) + printStoreAccessPath(stores) + ) + ) <= 2 * this.getNumberOfParameters() + 3 + } +} + +pragma[nomagic] +private predicate apiContentFlow( + ContentDataFlowSummaryTargetApi api, DataFlow::ParameterNode p, + PropagateContentFlow::AccessPath reads, ReturnNodeExt returnNodeExt, + PropagateContentFlow::AccessPath stores, boolean preservesValue +) { + PropagateContentFlow::flow(p, reads, returnNodeExt, stores, preservesValue) and + returnNodeExt.getEnclosingCallable() = api and + p.getEnclosingCallable() = api +} + +/** + * Holds if any of the content sets in `path` translates into a synthetic field. + */ +private predicate hasSyntheticContent(PropagateContentFlow::AccessPath path) { + exists(PropagateContentFlow::AccessPath tail, ContentSet head | + head = path.getHead() and + tail = path.getTail() and + ( + exists(getSyntheticName(head)) or + hasSyntheticContent(tail) + ) + ) +} + +/** + * A module containing predicates for validating access paths containing content sets + * that translate into synthetic fields, when used for generated summary models. + */ +private module AccessPathSyntheticValidation { + /** + * Holds if there exists an API that has content flow from `read` (on type `t1`) + * to `store` (on type `t2`). 
+ */ + private predicate step( + Type t1, PropagateContentFlow::AccessPath read, Type t2, PropagateContentFlow::AccessPath store + ) { + exists(DataFlow::ParameterNode p, ReturnNodeExt returnNodeExt | + p.getType() = t1 and + returnNodeExt.getType() = t2 and + apiContentFlow(_, p, read, returnNodeExt, store, _) + ) + } + + /** + * Holds if there exists an API that has content flow from `read` (on type `t1`) + * to `store` (on type `t2`), where `read` does not have synthetic content and `store` does. + * + * Step A -> Synth. + */ + private predicate synthPathEntry( + Type t1, PropagateContentFlow::AccessPath read, Type t2, PropagateContentFlow::AccessPath store + ) { + not hasSyntheticContent(read) and + hasSyntheticContent(store) and + step(t1, read, t2, store) + } + + /** + * Holds if there exists an API that has content flow from `read` (on type `t1`) + * to `store` (on type `t2`), where `read` has synthetic content + * and `store` does not. + * + * Step Synth -> A. + */ + private predicate synthPathExit( + Type t1, PropagateContentFlow::AccessPath read, Type t2, PropagateContentFlow::AccessPath store + ) { + hasSyntheticContent(read) and + not hasSyntheticContent(store) and + step(t1, read, t2, store) + } + + /** + * Takes one or more synthetic steps. + * Synth ->+ Synth + */ + private predicate synthPathStepRec( + Type t1, PropagateContentFlow::AccessPath read, Type t2, PropagateContentFlow::AccessPath store + ) { + hasSyntheticContent(read) and + hasSyntheticContent(store) and + ( + step(t1, read, t2, store) + or + exists(PropagateContentFlow::AccessPath mid, Type midType | + step(t1, read, midType, mid) and synthPathStepRec(midType, mid.reverse(), t2, store) + ) + ) + } + + /** + * Holds if there exists a path of steps from `read` to an exit. 
+ * + * read ->* Synth -> A + */ + private predicate reachesSynthExit(Type t, PropagateContentFlow::AccessPath read) { + synthPathExit(t, read, _, _) + or + exists(PropagateContentFlow::AccessPath mid, Type midType | + synthPathStepRec(t, read, midType, mid) and synthPathExit(midType, mid.reverse(), _, _) + ) + } + + /** + * Holds if there exists a path of steps from an entry to `store`. + * + * A -> Synth ->* store + */ + private predicate synthEntryReaches(Type t, PropagateContentFlow::AccessPath store) { + synthPathEntry(_, _, t, store) + or + exists(PropagateContentFlow::AccessPath mid, Type midType | + synthPathEntry(_, _, midType, mid) and synthPathStepRec(midType, mid.reverse(), t, store) + ) + } + + /** + * Holds if at least one of the access paths `read` (on type `t1`) and `store` (on type `t2`) + * contains content that will be translated into a synthetic field, when being used in + * a MaD summary model, and if there is a range of APIs, such that + * when chaining their flow access paths, there exist access paths `A` and `B` where + * A ->* read -> store ->* B and where `A` and `B` do not contain content that will + * be translated into a synthetic field. + * + * This is needed because we don't want to include summaries that read from or + * store into a "dead" synthetic field. + * + * Example: + * Assume we have a type `t` (in this case `t1` = `t2`) with methods `getX` and + * `setX`, which get and set a private field `X` on `t`. + * This would lead to the following content flows + * getX : Argument[this].SyntheticField[t.X] -> ReturnValue. + * setX : Argument[0] -> Argument[this].SyntheticField[t.X] + * As the reads and stores are on synthetic fields we should only make summaries + * if both of these methods exist. 
+ */ + pragma[nomagic] + predicate acceptReadStore( + Type t1, PropagateContentFlow::AccessPath read, Type t2, PropagateContentFlow::AccessPath store + ) { + synthPathEntry(t1, read, t2, store) and reachesSynthExit(t2, store.reverse()) + or + exists(PropagateContentFlow::AccessPath store0 | store0.reverse() = read | + synthEntryReaches(t1, store0) and synthPathExit(t1, read, t2, store) + or + synthEntryReaches(t1, store0) and + step(t1, read, t2, store) and + reachesSynthExit(t2, store.reverse()) + ) + } +} + +/** + * Holds if the API `api` has relevant flow from `read` on `p` to `store` on `returnNodeExt`. + * Flow is considered relevant: + * 1. If neither `read` nor `store` contains a content set that translates into a synthetic field. + * 2. If `read` or `store` contain a content set that translates into a synthetic field, and if + * the synthetic content is "live" on the relevant declaring type. + */ +private predicate apiRelevantContentFlow( + ContentDataFlowSummaryTargetApi api, DataFlow::ParameterNode p, + PropagateContentFlow::AccessPath read, ReturnNodeExt returnNodeExt, + PropagateContentFlow::AccessPath store, boolean preservesValue +) { + apiContentFlow(api, p, read, returnNodeExt, store, preservesValue) and + ( + not hasSyntheticContent(read) and not hasSyntheticContent(store) + or + AccessPathSyntheticValidation::acceptReadStore(p.getType(), read, returnNodeExt.getType(), store) + ) +} + +/** + * Gets the content based summary model(s) of the API `api` (if there is flow from a parameter to + * the return value or a parameter). + * + * Models are lifted to the best type in case the read and store access paths do not + * contain a field or synthetic field access. 
+ */ +string captureContentFlow(ContentDataFlowSummaryTargetApi api) { exists( DataFlow::ParameterNode p, ReturnNodeExt returnNodeExt, string input, string output, PropagateContentFlow::AccessPath reads, PropagateContentFlow::AccessPath stores, - boolean preservesValue + boolean preservesValue, boolean lift | - PropagateContentFlow::flow(p, reads, returnNodeExt, stores, preservesValue) and - returnNodeExt.getEnclosingCallable() = api and + apiRelevantContentFlow(api, p, reads, returnNodeExt, stores, preservesValue) and input = parameterNodeAsContentInput(p) + printReadAccessPath(reads) and output = getContentOutput(returnNodeExt) + printStoreAccessPath(stores) and input != output and - result = Printing::asModel(api, input, output, preservesValue) + (if mentionsField(reads) or mentionsField(stores) then lift = false else lift = true) and + result = Printing::asModel(api, input, output, preservesValue, lift) ) } diff --git a/java/ql/src/utils/modelgenerator/internal/CaptureModelsSpecific.qll b/java/ql/src/utils/modelgenerator/internal/CaptureModelsSpecific.qll index 2be162d5f9b..363b81293dd 100644 --- a/java/ql/src/utils/modelgenerator/internal/CaptureModelsSpecific.qll +++ b/java/ql/src/utils/modelgenerator/internal/CaptureModelsSpecific.qll @@ -340,16 +340,35 @@ predicate isAdditionalContentFlowStep(DataFlow::Node node1, DataFlow::Node node2 } /** - * Gets the MaD string representation of the contentset `c`. + * Holds if the content set `c` is a field or a synthetic field. */ -string printContent(ContentSet c) { - exists(Field f, string name | - f = c.(DataFlowUtil::FieldContent).getField() and name = f.getQualifiedName() - | - if f.isPublic() then result = "Field[" + name + "]" else result = "SyntheticField[" + name + "]" +predicate isField(ContentSet c) { + c instanceof DataFlowUtil::FieldContent or + c instanceof DataFlowUtil::SyntheticFieldContent +} + +/** + * Gets the MaD synthetic name string representation for the content set `c`, if any. 
+ */ +string getSyntheticName(DataFlow::ContentSet c) { + exists(Field f | + not f.isPublic() and + f = c.(DataFlowUtil::FieldContent).getField() and + result = f.getQualifiedName() ) or - result = "SyntheticField[" + c.(DataFlowUtil::SyntheticFieldContent).getField() + "]" + result = c.(DataFlowUtil::SyntheticFieldContent).getField() +} + +/** + * Gets the MaD string representation of the content set `c`. + */ +string printContent(ContentSet c) { + exists(Field f | f = c.(DataFlowUtil::FieldContent).getField() and f.isPublic() | + result = "Field[" + f.getQualifiedName() + "]" + ) + or + result = "SyntheticField[" + getSyntheticName(c) + "]" or c instanceof DataFlowUtil::CollectionContent and result = "Element" or diff --git a/java/ql/src/utils/modelgenerator/internal/CaptureTypeBasedSummaryModels.qll b/java/ql/src/utils/modelgenerator/internal/CaptureTypeBasedSummaryModels.qll index 34e53bad2d7..3d56dff5072 100644 --- a/java/ql/src/utils/modelgenerator/internal/CaptureTypeBasedSummaryModels.qll +++ b/java/ql/src/utils/modelgenerator/internal/CaptureTypeBasedSummaryModels.qll @@ -329,7 +329,7 @@ class TypeBasedFlowTargetApi extends Specific::SummaryTargetApi { output(this, tv, output) and input != output | - result = Printing::asValueModel(this, input, output) + result = Printing::asLiftedValueModel(this, input, output) ) } } diff --git a/shared/mad/codeql/mad/modelgenerator/ModelPrinting.qll b/shared/mad/codeql/mad/modelgenerator/ModelPrinting.qll index 2867d927984..4f5fa59d537 100644 --- a/shared/mad/codeql/mad/modelgenerator/ModelPrinting.qll +++ b/shared/mad/codeql/mad/modelgenerator/ModelPrinting.qll @@ -64,14 +64,23 @@ module ModelPrintingImpl { /** * Gets the summary model for `api` with `input`, `output` and `kind`. + * The model is lifted in case `lift` is true. 
*/ bindingset[input, output, kind] - private string asSummaryModel(Printing::SummaryApi api, string input, string output, string kind) { - result = - asPartialModel(api.lift()) + input + ";" // - + output + ";" // - + kind + ";" // - + Printing::getProvenance() + private string asSummaryModel( + Printing::SummaryApi api, string input, string output, string kind, boolean lift + ) { + exists(Lang::Callable c | + lift = true and c = api.lift() + or + lift = false and c = api + | + result = + asPartialModel(c) + input + ";" // + + output + ";" // + + kind + ";" // + + Printing::getProvenance() + ) } string asNeutralSummaryModel(Printing::SummaryApi api) { @@ -82,31 +91,35 @@ module ModelPrintingImpl { } /** - * Gets the value summary model for `api` with `input` and `output`. + * Gets the lifted value summary model for `api` with `input` and `output`. */ bindingset[input, output] - string asValueModel(Printing::SummaryApi api, string input, string output) { - result = asSummaryModel(api, input, output, "value") + string asLiftedValueModel(Printing::SummaryApi api, string input, string output) { + result = asModel(api, input, output, true, true) } /** - * Gets the taint summary model for `api` with `input` and `output`. + * Gets the lifted taint summary model for `api` with `input` and `output`. */ bindingset[input, output] - string asTaintModel(Printing::SummaryApi api, string input, string output) { - result = asSummaryModel(api, input, output, "taint") + string asLiftedTaintModel(Printing::SummaryApi api, string input, string output) { + result = asModel(api, input, output, false, true) } /** * Gets the summary model for `api` with `input` and `output`. + * (1) If `preservesValue` is true a "value" model is created. + * (2) If `lift` is true the model is lifted to the best possible type. 
*/ bindingset[input, output, preservesValue] - string asModel(Printing::SummaryApi api, string input, string output, boolean preservesValue) { + string asModel( + Printing::SummaryApi api, string input, string output, boolean preservesValue, boolean lift + ) { preservesValue = true and - result = asValueModel(api, input, output) + result = asSummaryModel(api, input, output, "value", lift) or preservesValue = false and - result = asTaintModel(api, input, output) + result = asSummaryModel(api, input, output, "taint", lift) } /**