diff --git a/python/change-notes/2021-10-08-improve-pickle-dill-shelve-modeling.md b/python/change-notes/2021-10-08-improve-pickle-dill-shelve-modeling.md new file mode 100644 index 00000000000..48c868ee416 --- /dev/null +++ b/python/change-notes/2021-10-08-improve-pickle-dill-shelve-modeling.md @@ -0,0 +1,2 @@ +lgtm,codescanning +* Improved modeling of decoding through pickle related functions (which can lead to code execution), resulting in additional sinks for the _Deserializing untrusted input_ query (`py/unsafe-deserialization`). Now we fully support `pickle.load`, `pickle.loads`, `pickle.Unpickler`, `marshal.load`, `marshal.loads`, `dill.load`, `dill.loads`, `shelve.open`. diff --git a/python/ql/lib/semmle/python/frameworks/Dill.qll b/python/ql/lib/semmle/python/frameworks/Dill.qll index 94af905756b..168542272f9 100644 --- a/python/ql/lib/semmle/python/frameworks/Dill.qll +++ b/python/ql/lib/semmle/python/frameworks/Dill.qll @@ -1,5 +1,5 @@ /** - * Provides classes modeling security-relevant aspects of the 'dill' package. + * Provides classes modeling security-relevant aspects of the `dill` PyPI package. * See https://pypi.org/project/dill/. */ @@ -10,18 +10,41 @@ private import semmle.python.Concepts private import semmle.python.ApiGraphs /** - * A call to `dill.loads` - * See https://pypi.org/project/dill/ (which currently refers you - * to https://docs.python.org/3/library/pickle.html#pickle.loads) + * Provides models for the `dill` PyPI package. + * See https://pypi.org/project/dill/. */ -private class DillLoadsCall extends Decoding::Range, DataFlow::CallCfgNode { - DillLoadsCall() { this = API::moduleImport("dill").getMember("loads").getACall() } +private module Dill { + /** + * A call to `dill.load` + * See https://pypi.org/project/dill/ (which currently refers you + * to https://docs.python.org/3/library/pickle.html#pickle.load) + */ + private class DillLoadCall extends Decoding::Range, DataFlow::CallCfgNode { + DillLoadCall() { this = API::moduleImport("dill").getMember("load").getACall() } - override predicate mayExecuteInput() { any() } + override predicate mayExecuteInput() { any() } - override DataFlow::Node getAnInput() { result = this.getArg(0) } + override DataFlow::Node getAnInput() { result in [this.getArg(0), this.getArgByName("file")] } - override DataFlow::Node getOutput() { result = this } + override DataFlow::Node getOutput() { result = this } - override string getFormat() { result = "dill" } + override string getFormat() { result = "dill" } + } + + /** + * A call to `dill.loads` + * See https://pypi.org/project/dill/ (which currently refers you + * to https://docs.python.org/3/library/pickle.html#pickle.loads) + */ + private class DillLoadsCall extends Decoding::Range, DataFlow::CallCfgNode { + DillLoadsCall() { this = API::moduleImport("dill").getMember("loads").getACall() } + + override predicate mayExecuteInput() { any() } + + override DataFlow::Node getAnInput() { result in [this.getArg(0), this.getArgByName("str")] } + + override DataFlow::Node getOutput() { result = this } + + override string getFormat() { result = "dill" } + } } diff --git a/python/ql/lib/semmle/python/frameworks/Stdlib.qll b/python/ql/lib/semmle/python/frameworks/Stdlib.qll index 539f0dcabb0..f89f74f3c34 100644 --- a/python/ql/lib/semmle/python/frameworks/Stdlib.qll +++ b/python/ql/lib/semmle/python/frameworks/Stdlib.qll @@ -428,6 +428,22 @@ private module StdlibPrivate { // --------------------------------------------------------------------------- // marshal // --------------------------------------------------------------------------- + /** + * A call to `marshal.load` + * See https://docs.python.org/3/library/marshal.html#marshal.load + */ + private class MarshalLoadCall extends Decoding::Range, DataFlow::CallCfgNode { + MarshalLoadCall() { this = API::moduleImport("marshal").getMember("load").getACall() } + + override predicate mayExecuteInput() { any() } + + override DataFlow::Node getAnInput() { result = this.getArg(0) } + + override DataFlow::Node getOutput() { result = this } + + override string getFormat() { result = "marshal" } + } + /** * A call to `marshal.loads` * See https://docs.python.org/3/library/marshal.html#marshal.loads @@ -447,15 +463,23 @@ private module StdlibPrivate { // --------------------------------------------------------------------------- // pickle // --------------------------------------------------------------------------- - /** Gets a reference to the `pickle` module. */ - DataFlow::Node pickle() { result = API::moduleImport(["pickle", "cPickle", "_pickle"]).getAUse() } + /** Gets a reference to any of the `pickle` modules. */ + API::Node pickle() { result = API::moduleImport(["pickle", "cPickle", "_pickle"]) } - /** Provides models for the `pickle` module. */ - module pickle { - /** Gets a reference to the `pickle.loads` function. */ - DataFlow::Node loads() { - result = API::moduleImport(["pickle", "cPickle", "_pickle"]).getMember("loads").getAUse() - } + /** + * A call to `pickle.load` + * See https://docs.python.org/3/library/pickle.html#pickle.load + */ + private class PickleLoadCall extends Decoding::Range, DataFlow::CallCfgNode { + PickleLoadCall() { this = pickle().getMember("load").getACall() } + + override predicate mayExecuteInput() { any() } + + override DataFlow::Node getAnInput() { result in [this.getArg(0), this.getArgByName("file")] } + + override DataFlow::Node getOutput() { result = this } + + override string getFormat() { result = "pickle" } } /** @@ -463,11 +487,63 @@ private module StdlibPrivate { * See https://docs.python.org/3/library/pickle.html#pickle.loads */ private class PickleLoadsCall extends Decoding::Range, DataFlow::CallCfgNode { - PickleLoadsCall() { this.getFunction() = pickle::loads() } + PickleLoadsCall() { this = pickle().getMember("loads").getACall() } override predicate mayExecuteInput() { any() } - override DataFlow::Node getAnInput() { result = this.getArg(0) } + override DataFlow::Node getAnInput() { result in [this.getArg(0), this.getArgByName("data")] } + + override DataFlow::Node getOutput() { result = this } + + override string getFormat() { result = "pickle" } + } + + /** + * A construction of a `pickle.Unpickler` + * See https://docs.python.org/3/library/pickle.html#pickle.Unpickler + */ + private class PickleUnpicklerCall extends Decoding::Range, DataFlow::CallCfgNode { + PickleUnpicklerCall() { this = pickle().getMember("Unpickler").getACall() } + + override predicate mayExecuteInput() { any() } + + override DataFlow::Node getAnInput() { result in [this.getArg(0), this.getArgByName("file")] } + + override DataFlow::Node getOutput() { result = this.getAMethodCall("load") } + + override string getFormat() { result = "pickle" } + } + + // --------------------------------------------------------------------------- + // shelve + // --------------------------------------------------------------------------- + /** + * A call to `shelve.open` + * See https://docs.python.org/3/library/shelve.html#shelve.open + * + * Claiming there is decoding of the input to `shelve.open` is a bit questionable, since + * it's not the filename, but the contents of the file that is decoded. + * + * However, we definitely want to be able to alert if a user is able to control what + * file is used, since that can lead to code execution (even if that file is free of + * path injection). + * + * So right now the best way we have of modeling this seems to be to treat the filename + * argument as being deserialized... + */ + private class ShelveOpenCall extends Decoding::Range, FileSystemAccess::Range, + DataFlow::CallCfgNode { + ShelveOpenCall() { this = API::moduleImport("shelve").getMember("open").getACall() } + + override predicate mayExecuteInput() { any() } + + override DataFlow::Node getAnInput() { + result in [this.getArg(0), this.getArgByName("filename")] + } + + override DataFlow::Node getAPathArgument() { + result in [this.getArg(0), this.getArgByName("filename")] + } override DataFlow::Node getOutput() { result = this } diff --git a/python/ql/test/library-tests/frameworks/dill/Decoding.py b/python/ql/test/library-tests/frameworks/dill/Decoding.py index 49eb551af04..bcbd6ff4b35 100644 --- a/python/ql/test/library-tests/frameworks/dill/Decoding.py +++ b/python/ql/test/library-tests/frameworks/dill/Decoding.py @@ -1,3 +1,6 @@ import dill -dill.loads(payload) # $decodeInput=payload decodeOutput=dill.loads(..) decodeFormat=dill decodeMayExecuteInput +dill.load(file_) # $ decodeInput=file_ decodeOutput=dill.load(..) decodeFormat=dill decodeMayExecuteInput +dill.load(file=file_) # $ decodeInput=file_ decodeOutput=dill.load(..) decodeFormat=dill decodeMayExecuteInput +dill.loads(payload) # $ decodeInput=payload decodeOutput=dill.loads(..) decodeFormat=dill decodeMayExecuteInput +dill.loads(str=payload) # $ decodeInput=payload decodeOutput=dill.loads(..) decodeFormat=dill decodeMayExecuteInput diff --git a/python/ql/test/library-tests/frameworks/stdlib/Decoding.py b/python/ql/test/library-tests/frameworks/stdlib/Decoding.py index 5d157a61f6e..e9e811ffc97 100644 --- a/python/ql/test/library-tests/frameworks/stdlib/Decoding.py +++ b/python/ql/test/library-tests/frameworks/stdlib/Decoding.py @@ -1,10 +1,31 @@ import pickle import marshal +import shelve import base64 +pickle.load(file_) # $ decodeInput=file_ decodeOutput=pickle.load(..) decodeFormat=pickle decodeMayExecuteInput +pickle.load(file=file_) # $ decodeInput=file_ decodeOutput=pickle.load(..) decodeFormat=pickle decodeMayExecuteInput pickle.loads(payload) # $ decodeInput=payload decodeOutput=pickle.loads(..) decodeFormat=pickle decodeMayExecuteInput +# using this keyword argument is disallowed from Python 3.9 +pickle.loads(data=payload) # $ decodeInput=payload decodeOutput=pickle.loads(..) decodeFormat=pickle decodeMayExecuteInput + +# We don't really have a good way to model a decode happening over multiple statements +# like this. Since the important bit for `py/unsafe-deserialization` is the input, that +# is the main focus. We do a best effort to model the output though (but that will only +# work in local scope). +unpickler = pickle.Unpickler(file_) # $ decodeInput=file_ decodeFormat=pickle decodeMayExecuteInput +unpickler.load() # $ decodeOutput=unpickler.load() +unpickler = pickle.Unpickler(file=file_) # $ decodeInput=file_ decodeFormat=pickle decodeMayExecuteInput + +marshal.load(file_) # $ decodeInput=file_ decodeOutput=marshal.load(..) decodeFormat=marshal decodeMayExecuteInput marshal.loads(payload) # $ decodeInput=payload decodeOutput=marshal.loads(..) decodeFormat=marshal decodeMayExecuteInput + +# if the file opened has been controlled by an attacker, this can lead to code +# execution. (underlying file format is pickle) +shelve.open(filepath) # $ decodeInput=filepath decodeOutput=shelve.open(..) decodeFormat=pickle decodeMayExecuteInput getAPathArgument=filepath +shelve.open(filename=filepath) # $ decodeInput=filepath decodeOutput=shelve.open(..) decodeFormat=pickle decodeMayExecuteInput getAPathArgument=filepath + # TODO: These tests should be merged with python/ql/test/experimental/dataflow/tainttracking/defaultAdditionalTaintStep/test_string.py base64.b64decode(payload) # $ decodeInput=payload decodeOutput=base64.b64decode(..) decodeFormat=Base64 base64.standard_b64decode(payload) # $ decodeInput=payload decodeOutput=base64.standard_b64decode(..) decodeFormat=Base64