diff --git a/python/ql/lib/semmle/python/frameworks/Pandas.qll b/python/ql/lib/semmle/python/frameworks/Pandas.qll index 254b49e89fb..244c11377fd 100644 --- a/python/ql/lib/semmle/python/frameworks/Pandas.qll +++ b/python/ql/lib/semmle/python/frameworks/Pandas.qll @@ -35,96 +35,99 @@ private module Pandas { override string getFormat() { result = "pickle" } } + /** + * Provides security-related models for `pandas.DataFrame`. + * See https://pandas.pydata.org/docs/reference/frame.html + */ module DataFrame { /** * A `pandas.DataFrame` Object. + * + * Extend this class to model new APIs. * See https://pandas.pydata.org/docs/reference/frame.html */ - abstract class Range extends API::Node { + abstract class DataFrame extends API::Node { override string toString() { result = this.(API::Node).toString() } } - } - /** - * The `pandas.DataFrame` Objects including secondary `pandas.DataFrame` Objects. - * Use this class where you want to find all `pandas.DataFrame` Objects. - * See https://pandas.pydata.org/pandas-docs/stable/reference/frame.html - */ - class DataFrame extends API::Node { - DataFrame() { - this = any(DataFrame::Range df) - or - exists(API::Node dataFrame | dataFrame = any(DataFrame::Range df) | - this = - dataFrame - .getMember([ - "copy", "from_records", "from_dict", "from_spmatrix", "assign", "select_dtypes", - "set_flags", "astype", "infer_objects", "head", "xs", "get", "isin", "where", - "mask", "query", "add", "mul", "truediv", "mod", "pow", "dot", "radd", "rsub", - "rdiv", "rfloordiv", "rtruediv", "rpow", "lt", "gt", "le", "ne", "agg", "combine", - "apply", "aggregate", "transform", "all", "any", "clip", "corr", "cov", "cummax", - "cummin", "cumprod", "describe", "mode", "pct_change", "quantile", "rank", - "round", "sem", "add_prefix", "add_suffix", "at_time", "between_time", "drop", - "drop_duplicates", "filter", "first", "head", "idxmin", "last", "reindex", - "reindex_like", "reset_index", "sample", "set_axis", "tail", "take", "truncate", - "bfill", 
"dropna", "ffill", "fillna", "interpolate", "isna", "isnull", "notna", - "notnull", "pad", "replace", "droplevel", "pivot", "pivot_table", - "reorder_levels", "sort_values", "sort_index", "nlargest", "nsmallest", - "swaplevel", "stack", "unstack", "isnull", "notna", "notnull", "replace", - "droplevel", "pivot", "pivot_table", "reorder_levels", "sort_values", - "sort_index", "nlargest", "nsmallest", "swaplevel", "stack", "unstack", "melt", - "explode", "squeeze", "T", "transpose", "compare", "join", "from_spmatrix", - "shift", "asof", "merge", "from_dict", "tz_convert", "to_period", "asfreq", - "to_dense", "tz_localize", "box", "__dataframe__" - ]) - .getReturn() - ) + /** + * A `pandas.DataFrame` instantiation. + * See https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html + */ + class DataFrameConstructor extends DataFrame { + DataFrameConstructor() { + this = API::moduleImport("pandas").getMember("DataFrame").getReturn() + } } - override string toString() { result = this.(API::Node).toString() } - } - - /** - * A `pandas.DataFrame` instantiation. - * See https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html - */ - class DataFrameConstructor extends DataFrame::Range { - DataFrameConstructor() { this = API::moduleImport("pandas").getMember("DataFrame").getReturn() } - } - - /** - * The `pandas.read_*` functions that return a `pandas.DataFrame`. 
- * See https://pandas.pydata.org/docs/reference/io.html - */ - class InputRead extends DataFrame::Range { - InputRead() { - this = - API::moduleImport("pandas") - .getMember([ - "read_csv", "read_fwf", "read_pickle", "read_table", "read_clipboard", "read_excel", - "read_xml", "read_parquet", "read_orc", "read_spss", "read_sql_table", - "read_sql_query", "read_sql", "read_gbq", "read_stata" - ]) - .getReturn() - or - this = API::moduleImport("pandas").getMember("read_html").getReturn().getASubscript() - or - exists(API::Node readSas, API::CallNode readSasCall | - readSas = API::moduleImport("pandas").getMember("read_sas") and - this = readSas.getReturn() and - readSasCall = readSas.getACall() - | - // Returns DataFrame if iterator=False and chunksize=None, With default values it returns DataFrame. - ( - not readSasCall.getParameter(5, "iterator").asSink().asExpr().(BooleanLiteral) instanceof - True - or - not exists(readSasCall.getParameter(5, "iterator").asSink()) - ) and - not exists( - readSasCall.getParameter(4, "chunksize").asSink().asExpr().(IntegerLiteral).getN() + /** + * The `pandas.read_*` functions that return a `pandas.DataFrame`. + * See https://pandas.pydata.org/docs/reference/io.html + */ + class InputRead extends DataFrame { + InputRead() { + this = + API::moduleImport("pandas") + .getMember([ + "read_csv", "read_fwf", "read_pickle", "read_table", "read_clipboard", + "read_excel", "read_xml", "read_parquet", "read_orc", "read_spss", + "read_sql_table", "read_sql_query", "read_sql", "read_gbq", "read_stata" + ]) + .getReturn() + or + this = API::moduleImport("pandas").getMember("read_html").getReturn().getASubscript() + or + exists(API::Node readSas, API::CallNode readSasCall | + readSas = API::moduleImport("pandas").getMember("read_sas") and + this = readSas.getReturn() and + readSasCall = readSas.getACall() + | + // Returns a DataFrame when iterator=False and chunksize=None; the default values also yield a DataFrame. 
+ ( + not readSasCall.getParameter(5, "iterator").asSink().asExpr().(BooleanLiteral) + instanceof True + or + not exists(readSasCall.getParameter(5, "iterator").asSink()) + ) and + not exists( + readSasCall.getParameter(4, "chunksize").asSink().asExpr().(IntegerLiteral).getN() + ) ) - ) + } + } + + /** + * The `pandas.DataFrame.*` methods that return a `pandas.DataFrame` object. + * See https://pandas.pydata.org/docs/reference/frame.html + */ + class DataFrameMethods extends DataFrame { + DataFrameMethods() { + exists(API::Node dataFrame | dataFrame = any(DataFrame df) | + this = + dataFrame + .getMember([ + "copy", "from_records", "from_dict", "from_spmatrix", "assign", "select_dtypes", + "set_flags", "astype", "infer_objects", "head", "xs", "get", "isin", "where", + "mask", "query", "add", "mul", "truediv", "mod", "pow", "dot", "radd", "rsub", + "rdiv", "rfloordiv", "rtruediv", "rpow", "lt", "gt", "le", "ne", "agg", + "combine", "apply", "aggregate", "transform", "all", "any", "clip", "corr", + "cov", "cummax", "cummin", "cumprod", "describe", "mode", "pct_change", + "quantile", "rank", "round", "sem", "add_prefix", "add_suffix", "at_time", + "between_time", "drop", "drop_duplicates", "filter", "first", "head", "idxmin", + "last", "reindex", "reindex_like", "reset_index", "sample", "set_axis", "tail", + "take", "truncate", "bfill", "dropna", "ffill", "fillna", "interpolate", "isna", + "isnull", "notna", "notnull", "pad", "replace", "droplevel", "pivot", + "pivot_table", "reorder_levels", "sort_values", "sort_index", "nlargest", + "nsmallest", "swaplevel", "stack", "unstack", "isnull", "notna", "notnull", + "replace", "droplevel", "pivot", "pivot_table", "reorder_levels", "sort_values", + "sort_index", "nlargest", "nsmallest", "swaplevel", "stack", "unstack", "melt", + "explode", "squeeze", "T", "transpose", "compare", "join", "from_spmatrix", + "shift", "asof", "merge", "from_dict", "tz_convert", "to_period", "asfreq", + "to_dense", "tz_localize", "box", 
"__dataframe__" + ]) + .getReturn() + ) + } } } @@ -134,7 +137,9 @@ private module Pandas { * https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.eval.html */ class DataFlowQueryCall extends CodeExecution::Range, API::CallNode { - DataFlowQueryCall() { this = any(DataFrame df).getMember(["query", "eval"]).getACall() } + DataFlowQueryCall() { + this = any(DataFrame::DataFrame df).getMember(["query", "eval"]).getACall() + } override DataFlow::Node getCode() { result = this.getParameter(0, "expr").asSink() } }