Patch summary (descriptive text only; it sits before the first "diff --git" header and is not part of any hunk).
Lxml.qll: adds the private class LXMLIterparseCall, which models calls to lxml.etree.iterparse as an XML::XMLParsing::Range.
The parsed input is the first positional argument or the "source" keyword argument, and the call node itself is the output.
vulnerableTo holds unconditionally for XXE; the in-code note gives the reason as the absence of a resolve_entities argument.
vulnerableTo holds for Billion Laughs and Quadratic Blowup when the huge_tree keyword argument has a local source that is a True literal.
vulnerableTo holds for DTD retrieval when load_dtd has a True local source and no_network has a False local source.
mayExecuteInput never holds for this class.
parsing.py: the parse and parseid test calls now pass xml_file instead of StringIO(x), and new iterparse test calls cover the default call, huge_tree=True, and load_dtd=True with no_network=False.
NOTE(review): inside the hunks below, many original lines appear to be joined onto single physical lines; confirm against the upstream patch before trying to apply it.
diff --git a/python/ql/lib/semmle/python/frameworks/Lxml.qll b/python/ql/lib/semmle/python/frameworks/Lxml.qll index 60cc850fd34..821fc6bac80 100644 --- a/python/ql/lib/semmle/python/frameworks/Lxml.qll +++ b/python/ql/lib/semmle/python/frameworks/Lxml.qll @@ -274,4 +274,34 @@ private module Lxml { result = this } } + + /** + * A call to `lxml.etree.iterparse` + * + * See + * - https://lxml.de/apidoc/lxml.etree.html?highlight=parseids#lxml.etree.iterparse + */ + private class LXMLIterparseCall extends DataFlow::CallCfgNode, XML::XMLParsing::Range { + LXMLIterparseCall() { + this = API::moduleImport("lxml").getMember("etree").getMember("iterparse").getACall() + } + + override DataFlow::Node getAnInput() { result in [this.getArg(0), this.getArgByName("source")] } + + override predicate vulnerableTo(XML::XMLParsingVulnerabilityKind kind) { + // note that there is no `resolve_entities` argument, so it's not possible to turn off XXE :O + kind.isXxe() + or + (kind.isBillionLaughs() or kind.isQuadraticBlowup()) and + this.getArgByName("huge_tree").getALocalSource().asExpr() = any(True t) + or + kind.isDtdRetrieval() and + this.getArgByName("load_dtd").getALocalSource().asExpr() = any(True t) and + this.getArgByName("no_network").getALocalSource().asExpr() = any(False t) + } + + override predicate mayExecuteInput() { none() } + + override DataFlow::Node getOutput() { result = this } + } } diff --git a/python/ql/test/library-tests/frameworks/lxml/parsing.py b/python/ql/test/library-tests/frameworks/lxml/parsing.py index e69a68a6ad2..5abd626caf4 100644 --- a/python/ql/test/library-tests/frameworks/lxml/parsing.py +++ b/python/ql/test/library-tests/frameworks/lxml/parsing.py @@ -16,11 +16,15 @@ lxml.etree.XML(text=x) # $ decodeFormat=XML decodeInput=x xmlVuln='XXE' decodeOu lxml.etree.XMLID(x) # $ decodeFormat=XML decodeInput=x xmlVuln='XXE' decodeOutput=lxml.etree.XMLID(..) 
lxml.etree.XMLID(text=x) # $ decodeFormat=XML decodeInput=x xmlVuln='XXE' decodeOutput=lxml.etree.XMLID(..) -lxml.etree.parse(StringIO(x)) # $ decodeFormat=XML decodeInput=StringIO(..) xmlVuln='XXE' decodeOutput=lxml.etree.parse(..) -lxml.etree.parse(source=StringIO(x)) # $ decodeFormat=XML decodeInput=StringIO(..) xmlVuln='XXE' decodeOutput=lxml.etree.parse(..) +xml_file = 'xml_file' +lxml.etree.parse(xml_file) # $ decodeFormat=XML decodeInput=xml_file xmlVuln='XXE' decodeOutput=lxml.etree.parse(..) +lxml.etree.parse(source=xml_file) # $ decodeFormat=XML decodeInput=xml_file xmlVuln='XXE' decodeOutput=lxml.etree.parse(..) -lxml.etree.parseid(StringIO(x)) # $ decodeFormat=XML decodeInput=StringIO(..) xmlVuln='XXE' decodeOutput=lxml.etree.parseid(..) -lxml.etree.parseid(source=StringIO(x)) # $ decodeFormat=XML decodeInput=StringIO(..) xmlVuln='XXE' decodeOutput=lxml.etree.parseid(..) +lxml.etree.parseid(xml_file) # $ decodeFormat=XML decodeInput=xml_file xmlVuln='XXE' decodeOutput=lxml.etree.parseid(..) +lxml.etree.parseid(source=xml_file) # $ decodeFormat=XML decodeInput=xml_file xmlVuln='XXE' decodeOutput=lxml.etree.parseid(..) + +lxml.etree.iterparse(xml_file) # $ decodeFormat=XML decodeInput=xml_file xmlVuln='XXE' decodeOutput=lxml.etree.iterparse(..) +lxml.etree.iterparse(source=xml_file) # $ decodeFormat=XML decodeInput=xml_file xmlVuln='XXE' decodeOutput=lxml.etree.iterparse(..) # With default parsers (nothing changed) parser = lxml.etree.XMLParser() @@ -55,3 +59,9 @@ lxml.etree.fromstring(x, parser=parser) # $ decodeFormat=XML decodeInput=x decod # DTD retrival vuln (also XXE) parser = lxml.etree.XMLParser(load_dtd=True, no_network=False) lxml.etree.fromstring(x, parser=parser) # $ decodeFormat=XML decodeInput=x xmlVuln='DTD retrieval' xmlVuln='XXE' decodeOutput=lxml.etree.fromstring(..) + +# iterparse configurations ... this doesn't use a parser argument but takes MOST (!) of +# the normal XMLParser arguments. 
Specifically, it doesn't allow disabling XXE :O + +lxml.etree.iterparse(xml_file, huge_tree=True) # $ decodeFormat=XML decodeInput=xml_file xmlVuln='Billion Laughs' xmlVuln='Quadratic Blowup' xmlVuln='XXE' decodeOutput=lxml.etree.iterparse(..) +lxml.etree.iterparse(xml_file, load_dtd=True, no_network=False) # $ decodeFormat=XML decodeInput=xml_file xmlVuln='DTD retrieval' xmlVuln='XXE' decodeOutput=lxml.etree.iterparse(..)