/**
 * Provides classes modeling security-relevant aspects of the `lxml` PyPI package.
 *
 * See
 * - https://pypi.org/project/lxml/
 * - https://lxml.de/tutorial.html
 */

private import python
private import semmle.python.dataflow.new.DataFlow
private import semmle.python.dataflow.new.TaintTracking
private import semmle.python.Concepts
private import semmle.python.ApiGraphs
private import semmle.python.frameworks.data.ModelsAsData

/**
 * INTERNAL: Do not use.
 *
 * Provides classes modeling security-relevant aspects of the `lxml` PyPI package.
 *
 * See
 * - https://pypi.org/project/lxml/
 * - https://lxml.de/tutorial.html
 */
module Lxml {
  /**
   * Gets a reference to the `lxml.etree` module, either through a direct import
   * or through the `lxml.etree~Alias` type contributed by data extensions.
   */
  API::Node etreeRef() {
    result = API::moduleImport("lxml").getMember("etree")
    or
    result = ModelOutput::getATypeNode("lxml.etree~Alias")
  }

  // ---------------------------------------------------------------------------
  // XPath
  // ---------------------------------------------------------------------------
  /**
   * A class constructor compiling an XPath expression.
   *
   *    from lxml import etree
   *    find_text = etree.XPath("`sink`")
   *    find_text = etree.ETXPath("`sink`")
   *
   * See
   * - https://lxml.de/apidoc/lxml.etree.html#lxml.etree.XPath
   * - https://lxml.de/apidoc/lxml.etree.html#lxml.etree.ETXPath
   */
  private class XPathClassCall extends XML::XPathConstruction::Range, DataFlow::CallCfgNode {
    XPathClassCall() { this = etreeRef().getMember(["XPath", "ETXPath"]).getACall() }

    override DataFlow::Node getXPath() { result in [this.getArg(0), this.getArgByName("path")] }

    override string getName() { result = "lxml.etree" }
  }

  /**
   * A call to the `xpath` method of a parsed document.
   *
   *    from lxml import etree
   *    root = etree.fromstring(file(XML_DB).read(), XMLParser())
   *    find_text = root.xpath("`sink`")
   *
   * See https://lxml.de/apidoc/lxml.etree.html#lxml.etree._ElementTree.xpath
   * as well as
   * - https://lxml.de/apidoc/lxml.etree.html#lxml.etree.parse
   * - https://lxml.de/apidoc/lxml.etree.html#lxml.etree.fromstring
   * - https://lxml.de/apidoc/lxml.etree.html#lxml.etree.fromstringlist
   * - https://lxml.de/apidoc/lxml.etree.html#lxml.etree.HTML
   * - https://lxml.de/apidoc/lxml.etree.html#lxml.etree.XML
   */
  class XPathCall extends XML::XPathExecution::Range, DataFlow::CallCfgNode {
    XPathCall() {
      // TODO: lxml.etree.parseid()[0] will contain the root element from parsing
      // but we don't really have a way to model that nicely.
      this = [Element::instance(), ElementTree::instance()].getMember("xpath").getACall()
    }

    override DataFlow::Node getXPath() { result in [this.getArg(0), this.getArgByName("_path")] }

    override string getName() { result = "lxml.etree" }
  }

  /**
   * A call to an evaluator object returned by `lxml.etree.XPathEvaluator`.
   *
   *    from lxml import etree
   *    evaluator = etree.XPathEvaluator(root)
   *    find_text = evaluator("`sink`")
   *
   * See https://lxml.de/apidoc/lxml.etree.html#lxml.etree.XPathEvaluator
   */
  class XPathEvaluatorCall extends XML::XPathExecution::Range, DataFlow::CallCfgNode {
    XPathEvaluatorCall() {
      this = etreeRef().getMember("XPathEvaluator").getReturn().getACall()
    }

    // The expression may be passed positionally or as `_path`, mirroring `XPathCall`.
    override DataFlow::Node getXPath() { result in [this.getArg(0), this.getArgByName("_path")] }

    override string getName() { result = "lxml.etree" }
  }

  // ---------------------------------------------------------------------------
  // Parsing
  // ---------------------------------------------------------------------------
  /**
   * Provides models for `lxml.etree` parsers.
   *
   * See https://lxml.de/apidoc/lxml.etree.html?highlight=xmlparser#lxml.etree.XMLParser
   */
  module XmlParser {
    /**
     * A source of instances of `lxml.etree` parsers, extend this class to model new instances.
     *
     * This can include instantiations of the class, return values from function
     * calls, or a special parameter that will be set when functions are called by an external
     * library.
     *
     * Use the predicate `XmlParser::instance()` to get references to instances of `lxml.etree` parsers.
     */
    abstract class InstanceSource extends DataFlow::LocalSourceNode {
      /** Holds if this instance is vulnerable to `kind`. */
      abstract predicate vulnerableTo(XML::XmlParsingVulnerabilityKind kind);
    }

    /**
     * A call to `lxml.etree.XMLParser`.
     *
     * See https://lxml.de/apidoc/lxml.etree.html?highlight=xmlparser#lxml.etree.XMLParser
     */
    private class LxmlParser extends InstanceSource, API::CallNode {
      LxmlParser() { this = etreeRef().getMember("XMLParser").getACall() }

      // NOTE: it's not possible to change settings of a parser after constructing it
      override predicate vulnerableTo(XML::XmlParsingVulnerabilityKind kind) {
        kind.isXxe() and
        (
          // resolve_entities has default True
          not exists(this.getArgByName("resolve_entities"))
          or
          this.getKeywordParameter("resolve_entities").getAValueReachingSink().asExpr() =
            any(True t)
        )
        or
        // DTD retrieval requires both `load_dtd=True` and `no_network=False`
        kind.isDtdRetrieval() and
        this.getKeywordParameter("load_dtd").getAValueReachingSink().asExpr() = any(True t) and
        this.getKeywordParameter("no_network").getAValueReachingSink().asExpr() = any(False t)
      }
    }

    /**
     * A call to `lxml.etree.get_default_parser`.
     *
     * See https://lxml.de/apidoc/lxml.etree.html?highlight=xmlparser#lxml.etree.get_default_parser
     */
    private class LxmlDefaultParser extends InstanceSource, DataFlow::CallCfgNode {
      LxmlDefaultParser() { this = etreeRef().getMember("get_default_parser").getACall() }

      override predicate vulnerableTo(XML::XmlParsingVulnerabilityKind kind) {
        // as highlighted by
        // https://lxml.de/apidoc/lxml.etree.html?highlight=xmlparser#lxml.etree.XMLParser
        // by default XXE is allowed, so as long as the default parser has not been
        // overridden, the result is also vulnerable to XXE.
        kind.isXxe()
        // TODO: take into account that you can override the default parser with `lxml.etree.set_default_parser`.
      }
    }

    /** Gets a reference to an `lxml.etree` parser instance, with origin in `origin`. */
    private DataFlow::TypeTrackingNode instance(DataFlow::TypeTracker t, InstanceSource origin) {
      t.start() and
      result = origin
      or
      exists(DataFlow::TypeTracker t2 | result = instance(t2, origin).track(t2, t))
    }

    /** Gets a reference to an `lxml.etree` parser instance, with origin in `origin`. */
    DataFlow::Node instance(InstanceSource origin) {
      instance(DataFlow::TypeTracker::end(), origin).flowsTo(result)
    }

    /** Gets a reference to an `lxml.etree` parser instance, that is vulnerable to `kind`. */
    DataFlow::Node instanceVulnerableTo(XML::XmlParsingVulnerabilityKind kind) {
      exists(InstanceSource origin | result = instance(origin) and origin.vulnerableTo(kind))
    }

    /**
     * A call to the `feed` method of an `lxml` parser.
     */
    private class LxmlParserFeedCall extends DataFlow::MethodCallNode, XML::XmlParsing::Range {
      LxmlParserFeedCall() { this.calls(instance(_), "feed") }

      override DataFlow::Node getAnInput() { result in [this.getArg(0), this.getArgByName("data")] }

      override predicate vulnerableTo(XML::XmlParsingVulnerabilityKind kind) {
        this.calls(instanceVulnerableTo(kind), "feed")
      }

      override predicate mayExecuteInput() { none() }

      override DataFlow::Node getOutput() {
        // The parsed result is obtained from a `close()` call on an object that the
        // receiver of this `feed` call flows to locally.
        exists(DataFlow::Node objRef |
          DataFlow::localFlow(this.getObject(), objRef) and
          result.(DataFlow::MethodCallNode).calls(objRef, "close")
        )
      }
    }
  }

  /**
   * A call to either of:
   * - `lxml.etree.fromstring`
   * - `lxml.etree.fromstringlist`
   * - `lxml.etree.HTML`
   * - `lxml.etree.XML`
   * - `lxml.etree.XMLID`
   * - `lxml.etree.XMLDTDID`
   * - `lxml.etree.parse`
   * - `lxml.etree.parseid`
   *
   * See
   * - https://lxml.de/apidoc/lxml.etree.html?highlight=parseids#lxml.etree.fromstring
   * - https://lxml.de/apidoc/lxml.etree.html?highlight=parseids#lxml.etree.fromstringlist
   * - https://lxml.de/apidoc/lxml.etree.html#lxml.etree.HTML
   * - https://lxml.de/apidoc/lxml.etree.html?highlight=parseids#lxml.etree.XML
   * - https://lxml.de/apidoc/lxml.etree.html?highlight=parseids#lxml.etree.XMLID
   * - https://lxml.de/apidoc/lxml.etree.html#lxml.etree.XMLDTDID
   * - https://lxml.de/apidoc/lxml.etree.html?highlight=parseids#lxml.etree.parse
   * - https://lxml.de/apidoc/lxml.etree.html?highlight=parseids#lxml.etree.parseid
   */
  private class LxmlParsing extends DataFlow::CallCfgNode, XML::XmlParsing::Range {
    string functionName;

    LxmlParsing() {
      functionName in [
          "fromstring", "fromstringlist", "HTML", "XML", "XMLID", "XMLDTDID", "parse", "parseid"
        ] and
      this = etreeRef().getMember(functionName).getACall()
    }

    override DataFlow::Node getAnInput() {
      result in [
          this.getArg(0),
          // fromstring / HTML / XML / XMLID / XMLDTDID
          this.getArgByName("text"),
          // fromstringlist
          this.getArgByName("strings"),
          // parse / parseid
          this.getArgByName("source"),
        ]
    }

    /** Gets the argument supplying the parser: positional argument 1 or the `parser` keyword. */
    DataFlow::Node getParserArg() { result in [this.getArg(1), this.getArgByName("parser")] }

    override predicate vulnerableTo(XML::XmlParsingVulnerabilityKind kind) {
      this.getParserArg() = XmlParser::instanceVulnerableTo(kind)
      or
      // Without an explicit parser argument the call is treated as XXE-vulnerable,
      // except for `HTML`.
      kind.isXxe() and
      not exists(this.getParserArg()) and
      not functionName = "HTML"
    }

    override predicate mayExecuteInput() { none() }

    override DataFlow::Node getOutput() {
      // Note: for `parseid`/XMLID the result of the call is a tuple with `(root, dict)`, so
      // maybe we should not just say that the entire tuple is the decoding output... my
      // gut feeling is that THIS instance doesn't matter too much, but that it would be
      // nice to be able to do this in general. (this is a problem for both `lxml.etree`
      // and `xml.etree`)
      result = this
    }
  }

  /**
   * A call to `lxml.etree.parse` or `lxml.etree.parseid`, which
   * takes either a filename or a file-like object as argument. To capture the filename
   * for path-injection, we have this subclass.
   *
   * See
   * - https://lxml.de/apidoc/lxml.etree.html?highlight=parseids#lxml.etree.parse
   * - https://lxml.de/apidoc/lxml.etree.html?highlight=parseids#lxml.etree.parseid
   */
  private class FileAccessFromLxmlParsing extends LxmlParsing, FileSystemAccess::Range {
    FileAccessFromLxmlParsing() {
      functionName in ["parse", "parseid"]
      // I considered whether we should try to reduce FPs from people passing file-like
      // objects, which will not be a file system access (and couldn't cause a
      // path-injection).
      //
      // I suppose that once we have proper flow-summary support for file-like objects,
      // we can make the XXE/XML-bomb sinks allow an access-path, while the
      // path-injection sink wouldn't, and then we will not end up with such FPs.
    }

    override DataFlow::Node getAPathArgument() { result = this.getAnInput() }
  }

  /**
   * A call to `lxml.etree.iterparse`
   *
   * See
   * - https://lxml.de/apidoc/lxml.etree.html?highlight=parseids#lxml.etree.iterparse
   */
  private class LxmlIterparseCall extends API::CallNode, XML::XmlParsing::Range,
    FileSystemAccess::Range
  {
    LxmlIterparseCall() { this = etreeRef().getMember("iterparse").getACall() }

    override DataFlow::Node getAnInput() { result in [this.getArg(0), this.getArgByName("source")] }

    override predicate vulnerableTo(XML::XmlParsingVulnerabilityKind kind) {
      // note that there is no `resolve_entities` argument, so it's not possible to turn off XXE :O
      kind.isXxe()
      or
      // libxml2 has built-in protection against XML bombs via entity reference loop detection,
      // so lxml is not vulnerable to XML bomb attacks.
      kind.isDtdRetrieval() and
      this.getKeywordParameter("load_dtd").getAValueReachingSink().asExpr() = any(True t) and
      this.getKeywordParameter("no_network").getAValueReachingSink().asExpr() = any(False t)
    }

    override predicate mayExecuteInput() { none() }

    override DataFlow::Node getOutput() { result = this }

    override DataFlow::Node getAPathArgument() { result = this.getAnInput() }
  }

  /** Provides models for the `lxml.etree.Element` class. */
  module Element {
    /** Gets a reference to the `Element` class. */
    API::Node classRef() { result = etreeRef().getMember(["Element", "_Element"]) }

    /**
     * A source of `lxml.etree.Element` instances, extend this class to model new instances.
     *
     * This can include instantiations of the class, return values from function
     * calls, or a special parameter that will be set when functions are called by an external
     * library.
     *
     * Use the predicate `Element::instance()` to get references to `lxml.etree.Element` instances.
     */
    abstract class InstanceSource instanceof API::Node {
      /** Gets a textual representation of this element. */
      string toString() { result = super.toString() }
    }

    /** Gets a reference to an `lxml.etree.Element` instance. */
    API::Node instance() { result instanceof InstanceSource }

    /** An `Element` instantiated directly. */
    private class ElementInstance extends InstanceSource {
      ElementInstance() { this = classRef().getAnInstance() }
    }

    /** The result of a parse operation that returns an `Element`. */
    private class ParseResult extends InstanceSource {
      ParseResult() {
        // TODO: The XmlParser module does not currently use API graphs
        this =
          [
            etreeRef().getMember("XMLParser").getAnInstance(),
            etreeRef().getMember("get_default_parser").getReturn()
          ].getMember("close").getReturn()
        or
        // TODO: `XMLID`, `XMLDTDID`, `parseid` returns a tuple of which the first element is an `Element`.
        // `iterparse` returns an iterator of tuples, each of which has a second element that is an `Element`.
        this = etreeRef().getMember(["XML", "HTML", "fromstring", "fromstringlist"]).getReturn()
      }
    }

    /** A call to a method on an `Element` that returns another `Element`. */
    private class ElementMethod extends InstanceSource {
      ElementMethod() {
        // an Element is an iterator of Elements
        this = instance().getASubscript()
        or
        // methods that return an Element
        this = instance().getMember(["find", "getnext", "getprevious", "getparent"]).getReturn()
        or
        // methods that return an iterator of Elements
        // (the method is spelled `iterdescendants`, matching `ElementTaintStep` below)
        this =
          instance()
              .getMember([
                  "cssselect", "findall", "getchildren", "getiterator", "iter", "iterancestors",
                  "iterdescendants", "iterchildren", "itersiblings", "iterfind", "xpath"
                ])
              .getReturn()
              .getASubscript()
      }
    }

    /** A call to a method on an `ElementTree` that returns an `Element`. */
    private class ElementTreeMethod extends InstanceSource {
      ElementTreeMethod() {
        // methods that return a single Element
        this = ElementTree::instance().getMember(["getroot", "find"]).getReturn()
        or
        // methods whose result is subscripted to obtain Elements
        this =
          ElementTree::instance()
              .getMember(["findall", "getiterator", "iter", "iterfind", "xpath"])
              .getReturn()
              .getASubscript()
      }
    }

    /**
     * An additional taint step from an `Element` instance.
     * See https://lxml.de/apidoc/lxml.etree.html#lxml.etree.ElementBase.
     */
    private class ElementTaintStep extends TaintTracking::AdditionalTaintStep {
      override predicate step(DataFlow::Node nodeFrom, DataFlow::Node nodeTo) {
        exists(DataFlow::MethodCallNode call |
          nodeTo = call and instance().asSource().flowsTo(nodeFrom)
        |
          call.calls(nodeFrom,
            // We consider a node to be tainted if there could be taint anywhere in the element tree;
            // so sibling nodes (e.g. `getnext`) are also tainted.
            // This ensures nodes like `elem[0].getnext()` are tracked.
            [
              "cssselect", "find", "findall", "findtext", "get", "getchildren", "getiterator",
              "getnext", "getparent", "getprevious", "getroottree", "items", "iter",
              "iterancestors", "iterchildren", "iterdescendants", "itersiblings", "iterfind",
              "itertext", "keys", "values", "xpath"
            ])
        )
        or
        exists(DataFlow::AttrRead attr |
          nodeTo = attr and instance().asSource().flowsTo(nodeFrom)
        |
          attr.accesses(nodeFrom, ["attrib", "base", "nsmap", "prefix", "tag", "tail", "text"])
        )
      }
    }
  }

  /** Provides models for the `lxml.etree.ElementTree` class. */
  module ElementTree {
    /** Gets a reference to the `ElementTree` class. */
    API::Node classRef() { result = etreeRef().getMember(["ElementTree", "_ElementTree"]) }

    /**
     * A source of `lxml.etree.ElementTree` instances; extend this class to model new instances.
     *
     * This can include instantiations of the class, return values from function
     * calls, or a special parameter that will be set when functions are called by an external
     * library.
     *
     * Use the predicate `ElementTree::instance()` to get references to `lxml.etree.ElementTree` instances.
     */
    abstract class InstanceSource instanceof API::Node {
      /** Gets a textual representation of this element tree. */
      string toString() { result = super.toString() }
    }

    /** Gets a reference to an `lxml.etree.ElementTree` instance. */
    API::Node instance() { result instanceof InstanceSource }

    /** An `ElementTree` instantiated directly. */
    private class ElementTreeInstance extends InstanceSource {
      ElementTreeInstance() { this = classRef().getAnInstance() }
    }

    /** The result of a parse operation that returns an `ElementTree`. */
    private class ParseResult extends InstanceSource {
      ParseResult() { this = etreeRef().getMember("parse").getReturn() }
    }

    /** A call to a method on an `Element` that returns an `ElementTree`. */
    private class ElementMethod extends InstanceSource {
      ElementMethod() { this = Element::instance().getMember("getroottree").getReturn() }
    }

    /**
     * An additional taint step from an `ElementTree` instance.
     * See https://lxml.de/apidoc/lxml.etree.html#lxml.etree._ElementTree
     */
    private class ElementTaintStep extends TaintTracking::AdditionalTaintStep {
      override predicate step(DataFlow::Node nodeFrom, DataFlow::Node nodeTo) {
        exists(DataFlow::MethodCallNode call |
          nodeTo = call and instance().asSource().flowsTo(nodeFrom)
        |
          call.calls(nodeFrom,
            [
              "find", "findall", "findtext", "get", "getiterator", "getroot", "iter", "iterfind",
              "xpath"
            ])
        )
        or
        exists(DataFlow::AttrRead attr |
          nodeTo = attr and instance().asSource().flowsTo(nodeFrom)
        |
          attr.accesses(nodeFrom, "docinfo")
        )
      }
    }
  }

  /** A call to serialise xml to a string */
  private class XmlEncoding extends Encoding::Range, DataFlow::CallCfgNode {
    XmlEncoding() {
      this = etreeRef().getMember(["tostring", "tostringlist", "tounicode"]).getACall()
    }

    override DataFlow::Node getAnInput() {
      result = [this.getArg(0), this.getArgByName("element_or_tree")]
    }

    override DataFlow::Node getOutput() { result = this }

    override string getFormat() { result = "XML" }
  }
  // TODO: ElementTree.write can write to a file-like object; should that be a flow step?
  // It also can accept a filepath which could be a path injection sink.
}