diff --git a/python/ql/lib/semmle/python/frameworks/Pandas.qll b/python/ql/lib/semmle/python/frameworks/Pandas.qll
index 09c6513b645..254b49e89fb 100644
--- a/python/ql/lib/semmle/python/frameworks/Pandas.qll
+++ b/python/ql/lib/semmle/python/frameworks/Pandas.qll
@@ -34,4 +34,124 @@ private module Pandas {
 
     override string getFormat() { result = "pickle" }
   }
+
+  module DataFrame {
+    /**
+     * A `pandas.DataFrame` object.
+     *
+     * Extend this class to model additional sources of `pandas.DataFrame` objects.
+     * See https://pandas.pydata.org/docs/reference/frame.html
+     */
+    abstract class Range extends API::Node {
+      override string toString() { result = this.(API::Node).toString() }
+    }
+  }
+
+  /**
+   * A `pandas.DataFrame` object: either a `DataFrame::Range` node or the result of calling
+   * one of the listed members on such a node.
+   *
+   * Only a single level of chaining is modeled: the receiver of the member call must itself
+   * be a `DataFrame::Range` node.
+   * Use this class where you want to find all `pandas.DataFrame` objects.
+   * See https://pandas.pydata.org/pandas-docs/stable/reference/frame.html
+   */
+  class DataFrame extends API::Node {
+    DataFrame() {
+      this = any(DataFrame::Range df)
+      or
+      exists(API::Node dataFrame | dataFrame = any(DataFrame::Range df) |
+        this =
+          dataFrame
+              .getMember([
+                "copy", "from_records", "from_dict", "from_spmatrix", "assign", "select_dtypes",
+                "set_flags", "astype", "infer_objects", "head", "xs", "get", "isin", "where",
+                "mask", "query", "add", "mul", "truediv", "mod", "pow", "dot", "radd", "rsub",
+                "rdiv", "rfloordiv", "rtruediv", "rpow", "lt", "gt", "le", "ne", "agg", "combine",
+                "apply", "aggregate", "transform", "all", "any", "clip", "corr", "cov", "cummax",
+                "cummin", "cumprod", "describe", "mode", "pct_change", "quantile", "rank",
+                "round", "sem", "add_prefix", "add_suffix", "at_time", "between_time", "drop",
+                "drop_duplicates", "filter", "first", "idxmin", "last", "reindex",
+                "reindex_like", "reset_index", "sample", "set_axis", "tail", "take", "truncate",
+                "bfill", "dropna", "ffill", "fillna", "interpolate", "isna", "isnull", "notna",
+                "notnull", "pad", "replace", "droplevel", "pivot", "pivot_table",
+                "reorder_levels", "sort_values", "sort_index", "nlargest", "nsmallest",
+                "swaplevel", "stack", "unstack", "melt", "explode", "squeeze", "T", "transpose",
+                "compare", "join", "shift", "asof", "merge", "tz_convert", "to_period", "asfreq",
+                "to_dense", "tz_localize", "box", "__dataframe__"
+                ])
+              .getReturn()
+      )
+    }
+
+    override string toString() { result = this.(API::Node).toString() }
+  }
+
+  /**
+   * A `pandas.DataFrame` instantiation.
+   * See https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html
+   */
+  class DataFrameConstructor extends DataFrame::Range {
+    DataFrameConstructor() { this = API::moduleImport("pandas").getMember("DataFrame").getReturn() }
+  }
+
+  /**
+   * The result of a `pandas.read_*` call that returns a `pandas.DataFrame`.
+   * See https://pandas.pydata.org/docs/reference/io.html
+   */
+  class InputRead extends DataFrame::Range {
+    InputRead() {
+      this =
+        API::moduleImport("pandas")
+            .getMember([
+                "read_csv", "read_fwf", "read_pickle", "read_table", "read_clipboard", "read_excel",
+                "read_xml", "read_parquet", "read_orc", "read_spss", "read_sql_table",
+                "read_sql_query", "read_sql", "read_gbq", "read_stata"
+              ])
+            .getReturn()
+      or
+      // `read_html` returns a list of data frames, so model the elements of the result.
+      this = API::moduleImport("pandas").getMember("read_html").getReturn().getASubscript()
+      or
+      exists(API::Node readSas, API::CallNode readSasCall |
+        readSas = API::moduleImport("pandas").getMember("read_sas") and
+        readSasCall = readSas.getACall() and
+        // Tie the result to the call whose arguments are inspected below.
+        this = readSasCall.getReturn()
+      |
+        // Returns a DataFrame only when `iterator` is not `True` and `chunksize` is not an
+        // integer literal, which includes the case where both are left at their defaults.
+        (
+          not readSasCall.getParameter(5, "iterator").asSink().asExpr().(BooleanLiteral) instanceof
+            True
+          or
+          not exists(readSasCall.getParameter(5, "iterator").asSink())
+        ) and
+        not exists(
+          readSasCall.getParameter(4, "chunksize").asSink().asExpr().(IntegerLiteral).getN()
+        )
+      )
+    }
+  }
+
+  /**
+   * A call to `pandas.DataFrame.query` or `pandas.DataFrame.eval`.
+   * See https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.query.html
+   * https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.eval.html
+   */
+  class DataFlowQueryCall extends CodeExecution::Range, API::CallNode {
+    DataFlowQueryCall() { this = any(DataFrame df).getMember(["query", "eval"]).getACall() }
+
+    override DataFlow::Node getCode() { result = this.getParameter(0, "expr").asSink() }
+  }
+
+  /**
+   * A call to `pandas.eval`.
+   * See https://pandas.pydata.org/docs/reference/api/pandas.eval.html
+   */
+  class PandasEval extends CodeExecution::Range, API::CallNode {
+    PandasEval() { this = API::moduleImport("pandas").getMember("eval").getACall() }
+
+    override DataFlow::Node getCode() { result = this.getParameter(0, "expr").asSink() }
+  }
 }
diff --git a/python/ql/test/library-tests/frameworks/pandas/dataframe_query.py b/python/ql/test/library-tests/frameworks/pandas/dataframe_query.py
new file mode 100644
index 00000000000..a524fa21445
--- /dev/null
+++ b/python/ql/test/library-tests/frameworks/pandas/dataframe_query.py
@@ -0,0 +1,86 @@
+import pandas as pd
+
+
+df = pd.DataFrame({'temp_c': [17.0, 25.0]}, index=['Portland', 'Berkeley'])
+df.sample().query("query") # $getCode="query"
+df.mod().query("query") # $getCode="query"
+pd.eval("pythonExpr", target=df) # $getCode="pythonExpr"
+
+df = pd.read_csv("filepath")
+df.query("query") # $getCode="query"
+df.eval("query") # $getCode="query"
+df.copy().query("query") # $getCode="query"
+
+df = pd.read_fwf("filepath")
+df.query("query") # $getCode="query"
+df.eval("query") # $getCode="query"
+
+
+df = pd.read_pickle("filepath") # $ decodeInput="filepath" decodeOutput=pd.read_pickle(..) decodeFormat=pickle decodeMayExecuteInput
+df.query("query") # $getCode="query"
+df.eval("query") # $getCode="query"
+
+df = pd.read_table("filepath")
+df.query("query") # $getCode="query"
+df.eval("query") # $getCode="query"
+
+df = pd.read_clipboard("filepath")
+df.query("query") # $getCode="query"
+df.eval("query") # $getCode="query"
+
+df = pd.read_excel("filepath")
+df.query("query") # $getCode="query"
+df.eval("query") # $getCode="query"
+
+df = pd.read_html("filepath")
+df[0].query("query") # $getCode="query"
+
+df = pd.read_xml("filepath")
+df.query("query") # $getCode="query"
+df.eval("query") # $getCode="query"
+
+df = pd.read_parquet("filepath")
+df.query("query") # $getCode="query"
+df.eval("query") # $getCode="query"
+
+df = pd.read_orc("filepath")
+df.query("query") # $getCode="query"
+df.eval("query") # $getCode="query"
+
+df = pd.read_spss("filepath")
+df.query("query") # $getCode="query"
+df.eval("query") # $getCode="query"
+
+df = pd.read_sql_table("filepath", 'postgres:///db_name')
+df.query("query") # $getCode="query"
+df.eval("query") # $getCode="query"
+
+df = pd.read_sql_query("filepath", 'postgres:///db_name')
+df.query("query") # $getCode="query"
+df.eval("query") # $getCode="query"
+
+df = pd.read_sql("filepath", 'postgres:///db_name')
+df.query("query") # $getCode="query"
+df.eval("query") # $getCode="query"
+
+df = pd.read_gbq("filepath")
+df.query("query") # $getCode="query"
+df.eval("query") # $getCode="query"
+
+df = pd.read_stata("filepath")
+df.query("query") # $getCode="query"
+df.eval("query") # $getCode="query"
+
+df = pd.read_sas("filepath")
+df.query("query") # $getCode="query"
+df.eval("query") # $getCode="query"
+# read_sas is not modeled as returning a DataFrame when iterator=True or chunksize is an integer.
+df = pd.read_sas("filepath", iterator=True, chunksize=1)
+df.query("query")
+df = pd.read_sas("filepath", iterator=False, chunksize=1)
+df.query("query")
+df = pd.read_sas("filepath", iterator=True, chunksize=None)
+df.query("query")
+df = pd.read_sas("filepath", iterator=False, chunksize=None)
+df.query("query") # $getCode="query"
+df.eval("query") # $getCode="query"