From 3448751b4cef97f4b59857ea30cc29ac8191bc49 Mon Sep 17 00:00:00 2001
From: Rasmus Wriedt Larsen <rasmuswl@github.com>
Date: Mon, 19 Aug 2024 14:41:21 +0200
Subject: [PATCH] JS: Consolidate command-line argument modeling

Such that we can reuse the existing modeling, but have it globally
applied as a threat-model as well.

I Basically just moved the modeling. One important aspect is that this
changes is that the previously query-specific `argsParseStep` is now a
globally applied taint-step. This seems reasonable, if someone applied
the argument parsing to any user-controlled string, it seems correct to
propagate that taint for _any_ query.
---
 javascript/ql/lib/javascript.qll              |   1 +
 .../frameworks/CommandLineArguments.qll       | 142 ++++++++++++++++++
 ...IndirectCommandInjectionCustomizations.qll | 121 +--------------
 .../IndirectCommandInjectionQuery.qll         |   4 -
 .../threat-models/sources/sources.js          |  40 +++++
 5 files changed, 186 insertions(+), 122 deletions(-)
 create mode 100644 javascript/ql/lib/semmle/javascript/frameworks/CommandLineArguments.qll

diff --git a/javascript/ql/lib/javascript.qll b/javascript/ql/lib/javascript.qll
index 07fb759bd65..7bb2b767610 100644
--- a/javascript/ql/lib/javascript.qll
+++ b/javascript/ql/lib/javascript.qll
@@ -81,6 +81,7 @@ import semmle.javascript.frameworks.Classnames
 import semmle.javascript.frameworks.ClassValidator
 import semmle.javascript.frameworks.ClientRequests
 import semmle.javascript.frameworks.ClosureLibrary
+import semmle.javascript.frameworks.CommandLineArguments
 import semmle.javascript.frameworks.CookieLibraries
 import semmle.javascript.frameworks.Credentials
 import semmle.javascript.frameworks.CryptoLibraries
diff --git a/javascript/ql/lib/semmle/javascript/frameworks/CommandLineArguments.qll b/javascript/ql/lib/semmle/javascript/frameworks/CommandLineArguments.qll
new file mode 100644
index 00000000000..db1444db741
--- /dev/null
+++ b/javascript/ql/lib/semmle/javascript/frameworks/CommandLineArguments.qll
@@ -0,0 +1,142 @@
+/** Provides modeling for parsed command line arguments. */
+
+import javascript
+
+/**
+ * An object containing command-line arguments, potentially parsed by a library.
+ *
+ * Extend this class to refine existing API models. If you want to model new APIs,
+ * extend `CommandLineArguments::Range` instead.
+ */
+class CommandLineArguments extends ThreatModelSource instanceof CommandLineArguments::Range { }
+
+/** Provides a class for modeling new sources of remote user input. */
+module CommandLineArguments {
+  /**
+   * An object containing command-line arguments, potentially parsed by a library.
+   *
+   * Extend this class to model new APIs. If you want to refine existing API models,
+   * extend `CommandLineArguments` instead.
+   */
+  abstract class Range extends ThreatModelSource::Range {
+    override string getThreatModel() { result = "commandargs" }
+
+    override string getSourceType() { result = "CommandLineArguments" }
+  }
+}
+
+/** A read of `process.argv`, considered as a threat-model source. */
+private class ProcessArgv extends CommandLineArguments::Range {
+  ProcessArgv() {
+    // `process.argv[0]` and `process.argv[1]` are paths to `node` and `main`, and
+    // therefore should not be considered a threat-source... However, we don't have an
+    // easy way to exclude them, so we need to allow them.
+    this = NodeJSLib::process().getAPropertyRead("argv")
+  }
+
+  override string getSourceType() { result = "process.argv" }
+}
+
+private class DefaultModels extends CommandLineArguments::Range {
+  DefaultModels() {
+    // `require('get-them-args')(...)` => `{ unknown: [], a: ... b: ... }`
+    this = DataFlow::moduleImport("get-them-args").getACall()
+    or
+    // `require('optimist').argv` => `{ _: [], a: ... b: ... }`
+    this = DataFlow::moduleMember("optimist", "argv")
+    or
+    // `require("arg")({...spec})` => `{_: [], a: ..., b: ...}`
+    this = DataFlow::moduleImport("arg").getACall()
+    or
+    // `(new (require(argparse)).ArgumentParser({...spec})).parse_args()` => `{a: ..., b: ...}`
+    this =
+      API::moduleImport("argparse")
+          .getMember("ArgumentParser")
+          .getInstance()
+          .getMember("parse_args")
+          .getACall()
+    or
+    // `require('command-line-args')({...spec})` => `{a: ..., b: ...}`
+    this = DataFlow::moduleImport("command-line-args").getACall()
+    or
+    // `require('meow')(help, {...spec})` => `{a: ..., b: ....}`
+    this = DataFlow::moduleImport("meow").getACall()
+    or
+    // `require("dashdash").createParser(...spec)` => `{a: ..., b: ...}`
+    this =
+      [
+        API::moduleImport("dashdash"),
+        API::moduleImport("dashdash").getMember("createParser").getReturn()
+      ].getMember("parse").getACall()
+    or
+    // `require('commander').myCmdArgumentName`
+    this = commander().getAMember().asSource()
+    or
+    // `require('commander').opt()` => `{a: ..., b: ...}`
+    this = commander().getMember("opts").getACall()
+  }
+}
+
+/**
+ * A step for propagating taint through command line parsing,
+ * such as `var succ = require("minimist")(pred)`.
+ */
+private class ArgsParseStep extends TaintTracking::SharedTaintStep {
+  override predicate step(DataFlow::Node pred, DataFlow::Node succ) {
+    exists(DataFlow::CallNode call |
+      call = DataFlow::moduleMember("args", "parse").getACall() or
+      call = DataFlow::moduleImport(["yargs-parser", "minimist", "subarg"]).getACall()
+    |
+      succ = call and
+      pred = call.getArgument(0)
+    )
+  }
+}
+
+/**
+ * Gets a Command instance from the `commander` library.
+ */
+private API::Node commander() {
+  result = API::moduleImport("commander")
+  or
+  // `require("commander").program === require("commander")`
+  result = commander().getMember("program")
+  or
+  result = commander().getMember("Command").getInstance()
+  or
+  // lots of chainable methods
+  result = commander().getAMember().getReturn()
+}
+
+/**
+ * Gets an instance of `yargs`.
+ * Either directly imported as a module, or through some chained method call.
+ */
+private DataFlow::SourceNode yargs() {
+  result = DataFlow::moduleImport("yargs")
+  or
+  // script used to generate list of chained methods: https://gist.github.com/erik-krogh/f8afe952c0577f4b563a993e613269ba
+  exists(string method |
+    not method =
+      // the methods that does not return a chained `yargs` object.
+      [
+        "getContext", "getDemandedOptions", "getDemandedCommands", "getDeprecatedOptions",
+        "_getParseContext", "getOptions", "getGroups", "getStrict", "getStrictCommands",
+        "getExitProcess", "locale", "getUsageInstance", "getCommandInstance"
+      ]
+  |
+    result = yargs().getAMethodCall(method)
+  )
+}
+
+/**
+ * An array of command line arguments (`argv`) parsed by the `yargs` library.
+ */
+private class YargsArgv extends CommandLineArguments::Range {
+  YargsArgv() {
+    this = yargs().getAPropertyRead("argv")
+    or
+    this = yargs().getAMethodCall("parse") and
+    this.(DataFlow::MethodCallNode).getNumArgument() = 0
+  }
+}
diff --git a/javascript/ql/lib/semmle/javascript/security/dataflow/IndirectCommandInjectionCustomizations.qll b/javascript/ql/lib/semmle/javascript/security/dataflow/IndirectCommandInjectionCustomizations.qll
index b2b94fcca8d..9dd6ab4b4a9 100644
--- a/javascript/ql/lib/semmle/javascript/security/dataflow/IndirectCommandInjectionCustomizations.qll
+++ b/javascript/ql/lib/semmle/javascript/security/dataflow/IndirectCommandInjectionCustomizations.qll
@@ -25,21 +25,6 @@ module IndirectCommandInjection {
    */
   abstract class Sanitizer extends DataFlow::Node { }
 
-  /**
-   * A source of user input from the command-line, considered as a flow source for command injection.
-   */
-  private class CommandLineArgumentsArrayAsSource extends Source instanceof CommandLineArgumentsArray
-  { }
-
-  /**
-   * An array of command-line arguments.
-   */
-  class CommandLineArgumentsArray extends DataFlow::SourceNode {
-    CommandLineArgumentsArray() {
-      this = DataFlow::globalVarRef("process").getAPropertyRead("argv")
-    }
-  }
-
   /**
    * A read of `process.env`, considered as a flow source for command injection.
    */
@@ -82,109 +67,9 @@ module IndirectCommandInjection {
   }
 
   /**
-   * An object containing parsed command-line arguments, considered as a flow source for command injection.
+   * An object containing command-line arguments, considered as a flow source for command injection.
    */
-  class ParsedCommandLineArgumentsAsSource extends Source {
-    ParsedCommandLineArgumentsAsSource() {
-      // `require('get-them-args')(...)` => `{ unknown: [], a: ... b: ... }`
-      this = DataFlow::moduleImport("get-them-args").getACall()
-      or
-      // `require('optimist').argv` => `{ _: [], a: ... b: ... }`
-      this = DataFlow::moduleMember("optimist", "argv")
-      or
-      // `require("arg")({...spec})` => `{_: [], a: ..., b: ...}`
-      this = DataFlow::moduleImport("arg").getACall()
-      or
-      // `(new (require(argparse)).ArgumentParser({...spec})).parse_args()` => `{a: ..., b: ...}`
-      this =
-        API::moduleImport("argparse")
-            .getMember("ArgumentParser")
-            .getInstance()
-            .getMember("parse_args")
-            .getACall()
-      or
-      // `require('command-line-args')({...spec})` => `{a: ..., b: ...}`
-      this = DataFlow::moduleImport("command-line-args").getACall()
-      or
-      // `require('meow')(help, {...spec})` => `{a: ..., b: ....}`
-      this = DataFlow::moduleImport("meow").getACall()
-      or
-      // `require("dashdash").createParser(...spec)` => `{a: ..., b: ...}`
-      this =
-        [
-          API::moduleImport("dashdash"),
-          API::moduleImport("dashdash").getMember("createParser").getReturn()
-        ].getMember("parse").getACall()
-      or
-      // `require('commander').myCmdArgumentName`
-      this = commander().getAMember().asSource()
-      or
-      // `require('commander').opt()` => `{a: ..., b: ...}`
-      this = commander().getMember("opts").getACall()
-    }
-  }
-
-  /**
-   * Holds if there is a command line parsing step from `pred` to `succ`.
-   * E.g: `var succ = require("minimist")(pred)`.
-   */
-  predicate argsParseStep(DataFlow::Node pred, DataFlow::Node succ) {
-    exists(DataFlow::CallNode call |
-      call = DataFlow::moduleMember("args", "parse").getACall() or
-      call = DataFlow::moduleImport(["yargs-parser", "minimist", "subarg"]).getACall()
-    |
-      succ = call and
-      pred = call.getArgument(0)
-    )
-  }
-
-  /**
-   * Gets a Command instance from the `commander` library.
-   */
-  private API::Node commander() {
-    result = API::moduleImport("commander")
-    or
-    // `require("commander").program === require("commander")`
-    result = commander().getMember("program")
-    or
-    result = commander().getMember("Command").getInstance()
-    or
-    // lots of chainable methods
-    result = commander().getAMember().getReturn()
-  }
-
-  /**
-   * Gets an instance of `yargs`.
-   * Either directly imported as a module, or through some chained method call.
-   */
-  private DataFlow::SourceNode yargs() {
-    result = DataFlow::moduleImport("yargs")
-    or
-    // script used to generate list of chained methods: https://gist.github.com/erik-krogh/f8afe952c0577f4b563a993e613269ba
-    exists(string method |
-      not method =
-        // the methods that does not return a chained `yargs` object.
-        [
-          "getContext", "getDemandedOptions", "getDemandedCommands", "getDeprecatedOptions",
-          "_getParseContext", "getOptions", "getGroups", "getStrict", "getStrictCommands",
-          "getExitProcess", "locale", "getUsageInstance", "getCommandInstance"
-        ]
-    |
-      result = yargs().getAMethodCall(method)
-    )
-  }
-
-  /**
-   * An array of command line arguments (`argv`) parsed by the `yargs` library.
-   */
-  class YargsArgv extends Source {
-    YargsArgv() {
-      this = yargs().getAPropertyRead("argv")
-      or
-      this = yargs().getAMethodCall("parse") and
-      this.(DataFlow::MethodCallNode).getNumArgument() = 0
-    }
-  }
+  private class CommandLineArgumentsAsSource extends Source instanceof CommandLineArguments { }
 
   /**
    * A command-line argument that effectively is system-controlled, and therefore not likely to be exploitable when used in the execution of another command.
@@ -193,7 +78,7 @@ module IndirectCommandInjection {
     SystemControlledCommandLineArgumentSanitizer() {
       // `process.argv[0]` and `process.argv[1]` are paths to `node` and `main`.
       exists(string index | index = "0" or index = "1" |
-        this = any(CommandLineArgumentsArray a).getAPropertyRead(index)
+        this = DataFlow::globalVarRef("process").getAPropertyRead("argv").getAPropertyRead(index)
       )
     }
   }
diff --git a/javascript/ql/lib/semmle/javascript/security/dataflow/IndirectCommandInjectionQuery.qll b/javascript/ql/lib/semmle/javascript/security/dataflow/IndirectCommandInjectionQuery.qll
index d2de26d5cd0..b3e59aec7bd 100644
--- a/javascript/ql/lib/semmle/javascript/security/dataflow/IndirectCommandInjectionQuery.qll
+++ b/javascript/ql/lib/semmle/javascript/security/dataflow/IndirectCommandInjectionQuery.qll
@@ -28,8 +28,4 @@ class Configuration extends TaintTracking::Configuration {
   override predicate isSink(DataFlow::Node sink) { this.isSinkWithHighlight(sink, _) }
 
   override predicate isSanitizer(DataFlow::Node node) { node instanceof Sanitizer }
-
-  override predicate isAdditionalTaintStep(DataFlow::Node pred, DataFlow::Node succ) {
-    argsParseStep(pred, succ)
-  }
 }
diff --git a/javascript/ql/test/library-tests/threat-models/sources/sources.js b/javascript/ql/test/library-tests/threat-models/sources/sources.js
index 0194af87f6e..0e9e73f57be 100644
--- a/javascript/ql/test/library-tests/threat-models/sources/sources.js
+++ b/javascript/ql/test/library-tests/threat-models/sources/sources.js
@@ -2,3 +2,43 @@ import 'dummy';
 
 var x = process.env['foo']; // $ threat-source=environment
 SINK(x); // $ hasFlow
+
+var y = process.argv[2]; // $ threat-source=commandargs
+SINK(y); // $ hasFlow
+
+
+// Accessing command line arguments using yargs
+// https://www.npmjs.com/package/yargs/v/17.7.2
+const yargs = require('yargs/yargs');
+const { hideBin } = require('yargs/helpers');
+const argv = yargs(hideBin(process.argv)).argv; // $ threat-source=commandargs
+
+SINK(argv.foo); // $ MISSING: hasFlow
+
+// older version
+// https://www.npmjs.com/package/yargs/v/7.1.2
+const yargsOld = require('yargs');
+const argvOld = yargsOld.argv; // $ threat-source=commandargs
+
+SINK(argvOld.foo); // $ hasFlow
+
+// Accessing command line arguments using yargs-parser
+const yargsParser = require('yargs-parser');
+const src = process.argv.slice(2); // $ threat-source=commandargs
+const parsedArgs = yargsParser(src);
+
+SINK(parsedArgs.foo); // $ hasFlow
+
+// Accessing command line arguments using minimist
+const minimist = require('minimist');
+const args = minimist(process.argv.slice(2)); // $ threat-source=commandargs
+
+SINK(args.foo); // $ hasFlow
+
+
+// Accessing command line arguments using commander
+const { Command } = require('commander'); // $ SPURIOUS: threat-source=commandargs
+const program = new Command();
+program.parse(process.argv); // $ threat-source=commandargs
+
+SINK(program.opts().foo); // $ hasFlow SPURIOUS: threat-source=commandargs