Python: Add py/weak-sensitive-data-hashing query

2026-04-30 19:26:02 +02:00 · 2021-04-22 11:49:38 +02:00
parent 499adc26a3
commit ac83c695ad
12 changed files with 488 additions and 0 deletions
--- a/python/ql/src/Security/CWE-327/WeakSensitiveDataHashing.qhelp
+++ b/python/ql/src/Security/CWE-327/WeakSensitiveDataHashing.qhelp
@@ -0,0 +1,103 @@
+<!DOCTYPE qhelp PUBLIC
+"-//Semmle//qhelp//EN"
+"qhelp.dtd">
+<qhelp>
+     <overview>
+          <p>
+               Using a broken or weak cryptographic hash function can leave data
+               vulnerable, and should not be used in security related code.
+          </p>
+
+          <p>
+               A strong cryptographic hash function should be resistant to:
+          </p>
+          <ul>
+               <li>
+                    pre-image attacks: if you know a hash value <code>h(x)</code>,
+                    you should not be able to easily find the input <code>x</code>.
+               </li>
+               <li>
+                    collision attacks: if you know a hash value <code>h(x)</code>,
+                    you should not be able to easily find a different input <code>y</code>
+                    such that hash value is the same <code>h(x) = h(y)</code>.
+               </li>
+          </ul>
+          <p>
+               In cases with a limited input space, such as for passwords, the hash
+               function also needs to be computationally expensive to be resistant to
+               brute-force attacks.
+          </p>
+
+          <p>
+               As an example, both MD5 and SHA-1 is known to be vulnerable to collision attacks.
+          </p>
+
+          <p>
+               Since it's OK to use a weak cryptographic hash function in a non-security
+               context, this query only alerts when these are used to hash sensitive
+               data (such as passwords, certificates, usernames).
+          </p>
+
+          <p>
+               Use of broken or weak cryptographic algorithms that are not hashing algorithms, is
+               handled by the <code>py/weak-cryptographic-algorithm</code> query.
+          </p>
+
+     </overview>
+     <recommendation>
+
+          <p>
+               Ensure that you use a strong, modern cryptographic hash function:
+          </p>
+
+          <ul>
+               <li>
+                    such as Argon2, scrypt, bcrypt, or PBKDF2 for passwords and other data with limited input space.
+               </li>
+               <li>
+                    such as SHA-2, or SHA-3 in other cases.
+               </li>
+          </ul>
+
+     </recommendation>
+     <example>
+
+          <p>
+               The following example shows two functions for checking whether the hash
+               of a certificate matches a known value -- to prevent tampering.
+
+               The first function uses MD5 that is known to be vulnerable to collision attacks.
+
+               The second function uses SHA-256 that is a strong cryptographic hashing function.
+          </p>
+
+          <sample src="examples/weak_certificate_hashing.py" />
+
+     </example>
+     <example>
+          <p>
+               The following example shows two functions for hashing passwords.
+
+               The first function uses SHA-256 to hash passwords. Although SHA-256 is a
+               strong cryptographic hash function, it is not suitable for password
+               hashing since it is not computationally expensive.
+          </p>
+
+          <sample src="examples/weak_password_hashing_bad.py" />
+
+
+          <p>
+               The second function uses Argon2 (through the <code>argon2-cffi</code>
+               PyPI package), which is a strong password hashing algorithm (and
+               includes a per-password salt by default).
+          </p>
+
+          <sample src="examples/weak_password_hashing_good.py" />
+
+     </example>
+
+     <references>
+          <li>OWASP: <a href="https://cheatsheetseries.owasp.org/cheatsheets/Password_Storage_Cheat_Sheet.html">Password Storage Cheat Sheet</a></li>
+     </references>
+
+</qhelp>
--- a/python/ql/src/Security/CWE-327/WeakSensitiveDataHashing.ql
+++ b/python/ql/src/Security/CWE-327/WeakSensitiveDataHashing.ql
@@ -0,0 +1,47 @@
+/**
+ * @name Use of a broken or weak cryptographic hashing algorithm on sensitive data
+ * @description Using broken or weak cryptographic hashing algorithms can compromise security.
+ * @kind path-problem
+ * @problem.severity warning
+ * @precision high
+ * @id py/weak-sensitive-data-hashing
+ * @tags security
+ *       external/cwe/cwe-327
+ *       external/cwe/cwe-916
+ */
+
+import python
+import semmle.python.security.dataflow.WeakSensitiveDataHashing
+import semmle.python.dataflow.new.DataFlow
+import semmle.python.dataflow.new.TaintTracking
+import DataFlow::PathGraph
+
+from
+  DataFlow::PathNode source, DataFlow::PathNode sink, string ending, string algorithmName,
+  string classification
+where
+  exists(NormalHashFunction::Configuration config |
+    config.hasFlowPath(source, sink) and
+    algorithmName = sink.getNode().(NormalHashFunction::Sink).getAlgorithmName() and
+    classification = source.getNode().(NormalHashFunction::Source).getClassification() and
+    ending = "."
+  )
+  or
+  exists(ComputationallyExpensiveHashFunction::Configuration config |
+    config.hasFlowPath(source, sink) and
+    algorithmName = sink.getNode().(ComputationallyExpensiveHashFunction::Sink).getAlgorithmName() and
+    classification =
+      source.getNode().(ComputationallyExpensiveHashFunction::Source).getClassification() and
+    (
+      sink.getNode().(ComputationallyExpensiveHashFunction::Sink).isComputationallyExpensive() and
+      ending = "."
+      or
+      not sink.getNode().(ComputationallyExpensiveHashFunction::Sink).isComputationallyExpensive() and
+      ending =
+        " for " + classification +
+          " hashing, since it is not a computationally expensive hash function."
+    )
+  )
+select sink.getNode(), source, sink,
+  "$@ is used in a hashing algorithm (" + algorithmName + ") that is insecure" + ending,
+  source.getNode(), "Sensitive data (" + classification + ")"
--- a/python/ql/src/Security/CWE-327/examples/weak_certificate_hashing.py
+++ b/python/ql/src/Security/CWE-327/examples/weak_certificate_hashing.py
@@ -0,0 +1,9 @@
+import hashlib
+
+def certificate_matches_known_hash_bad(certificate, known_hash):
+    hash = hashlib.md5(certificate).hexdigest() # BAD
+    return hash == known_hash
+
+def certificate_matches_known_hash_good(certificate, known_hash):
+    hash = hashlib.sha256(certificate).hexdigest() # GOOD
+    return hash == known_hash
--- a/python/ql/src/Security/CWE-327/examples/weak_password_hashing_bad.py
+++ b/python/ql/src/Security/CWE-327/examples/weak_password_hashing_bad.py
@@ -0,0 +1,4 @@
+import hashlib
+
+def get_password_hash(password: str, salt: str):
+    return hashlib.sha256(password + salt).hexdigest() # BAD
--- a/python/ql/src/Security/CWE-327/examples/weak_password_hashing_good.py
+++ b/python/ql/src/Security/CWE-327/examples/weak_password_hashing_good.py
@@ -0,0 +1,9 @@
+from argon2 import PasswordHasher
+
+def get_initial_hash(password: str):
+    ph = PasswordHasher()
+    return ph.hash(password) # GOOD
+
+def check_password(password: str, known_hash):
+    ph = PasswordHasher()
+    return ph.verify(known_hash, password) # GOOD
--- a/python/ql/src/semmle/python/security/dataflow/WeakSensitiveDataHashing.qll
+++ b/python/ql/src/semmle/python/security/dataflow/WeakSensitiveDataHashing.qll
@@ -0,0 +1,74 @@
+/**
+ * Provides a taint-tracking configuration for detecting use of a broken or weak
+ * cryptographic hashing algorithm on sensitive data.
+ *
+ * Note, for performance reasons: only import this file if
+ * `WeakSensitiveDataHashing::Configuration` is needed, otherwise
+ * `SqlInjectionCustomizations` should be imported instead.
+ */
+
+private import python
+private import semmle.python.dataflow.new.DataFlow
+private import semmle.python.dataflow.new.TaintTracking
+private import semmle.python.Concepts
+private import semmle.python.dataflow.new.RemoteFlowSources
+private import semmle.python.dataflow.new.BarrierGuards
+
+/**
+ * Provides a taint-tracking configuration for detecting use of a broken or weak
+ * cryptographic hash function on sensitive data, that does NOT require a
+ * computationally expensive hash function.
+ */
+module NormalHashFunction {
+  import WeakSensitiveDataHashingCustomizations::NormalHashFunction
+
+  /**
+   * A taint-tracking configuration for detecting use of a broken or weak
+   * cryptographic hashing algorithm on sensitive data.
+   */
+  class Configuration extends TaintTracking::Configuration {
+    Configuration() { this = "NormalHashFunction" }
+
+    override predicate isSource(DataFlow::Node source) { source instanceof Source }
+
+    override predicate isSink(DataFlow::Node sink) { sink instanceof Sink }
+
+    override predicate isSanitizer(DataFlow::Node node) {
+      super.isSanitizer(node)
+      or
+      node instanceof Sanitizer
+    }
+  }
+}
+
+/**
+ * Provides a taint-tracking configuration for detecting use of a broken or weak
+ * cryptographic hashing algorithm on passwords.
+ *
+ * Passwords has stricter requirements on the hashing algorithm used (must be
+ * computationally expensive to prevent brute-force attacks).
+ */
+module ComputationallyExpensiveHashFunction {
+  import WeakSensitiveDataHashingCustomizations::ComputationallyExpensiveHashFunction
+
+  /**
+   * A taint-tracking configuration for detecting use of a broken or weak
+   * cryptographic hashing algorithm on passwords.
+   *
+   * Passwords has stricter requirements on the hashing algorithm used (must be
+   * computationally expensive to prevent brute-force attacks).
+   */
+  class Configuration extends TaintTracking::Configuration {
+    Configuration() { this = "ComputationallyExpensiveHashFunction" }
+
+    override predicate isSource(DataFlow::Node source) { source instanceof Source }
+
+    override predicate isSink(DataFlow::Node sink) { sink instanceof Sink }
+
+    override predicate isSanitizer(DataFlow::Node node) {
+      super.isSanitizer(node)
+      or
+      node instanceof Sanitizer
+    }
+  }
+}
--- a/python/ql/src/semmle/python/security/dataflow/WeakSensitiveDataHashingCustomizations.qll
+++ b/python/ql/src/semmle/python/security/dataflow/WeakSensitiveDataHashingCustomizations.qll
@@ -0,0 +1,157 @@
+/**
+ * Provides default sources, sinks and sanitizers for detecting
+ * "use of a broken or weak cryptographic hashing algorithm on sensitive data"
+ * vulnerabilities, as well as extension points for adding your own.
+ */
+
+private import python
+private import semmle.python.dataflow.new.DataFlow
+private import semmle.python.dataflow.new.TaintTracking
+private import semmle.python.Concepts
+private import semmle.python.dataflow.new.SensitiveDataSources
+private import semmle.python.dataflow.new.BarrierGuards
+
+/**
+ * Provides default sources, sinks and sanitizers for detecting
+ * "use of a broken or weak cryptographic hashing algorithm on sensitive data"
+ * vulnerabilities on sensitive data that does NOT require computationally expensive
+ * hashing, as well as extension points for adding your own.
+ *
+ * Also see the `ComputationallyExpensiveHashFunction` module.
+ */
+module NormalHashFunction {
+  /**
+   * A data flow source for "use of a broken or weak cryptographic hashing algorithm on
+   * sensitive data" vulnerabilities.
+   */
+  abstract class Source extends DataFlow::Node {
+    Source() { not this instanceof ComputationallyExpensiveHashFunction::Source }
+
+    /** Gets the classification of the sensitive data. */
+    abstract string getClassification();
+  }
+
+  /**
+   * A data flow sink for "use of a broken or weak cryptographic hashing algorithm on
+   * sensitive data" vulnerabilities.
+   */
+  abstract class Sink extends DataFlow::Node {
+    /**
+     * Gets the name of the weak hashing algorithm.
+     */
+    abstract string getAlgorithmName();
+  }
+
+  /**
+   * A sanitizer for "use of a broken or weak cryptographic hashing algorithm on
+   * sensitive data" vulnerabilities.
+   */
+  abstract class Sanitizer extends DataFlow::Node { }
+
+  /**
+   * A source of sensitive data, considered as a flow source.
+   */
+  class SensitiveDataSourceAsSource extends Source, SensitiveDataSource {
+    override string getClassification() { result = SensitiveDataSource.super.getClassification() }
+  }
+
+  /** The input to a hashing operation using a weak algorithm, considered as a flow sink. */
+  class WeakHashingOperationInputSink extends Sink {
+    Cryptography::HashingAlgorithm algorithm;
+
+    WeakHashingOperationInputSink() {
+      exists(Cryptography::CryptographicOperation operation |
+        algorithm = operation.getAlgorithm() and
+        algorithm.isWeak() and
+        this = operation.getAnInput()
+      )
+    }
+
+    override string getAlgorithmName() { result = algorithm.getName() }
+  }
+}
+
+/**
+ * Provides default sources, sinks and sanitizers for detecting
+ * "use of a broken or weak cryptographic hashing algorithm on sensitive data"
+ * vulnerabilities on sensitive data that DOES require computationally expensive
+ * hashing, as well as extension points for adding your own.
+ *
+ * Also see the `NormalHashFunction` module.
+ */
+module ComputationallyExpensiveHashFunction {
+  /**
+   * A data flow source of sensitive data that requires computationally expensive
+   * hashing for "use of a broken or weak cryptographic hashing algorithm on sensitive
+   * data" vulnerabilities.
+   */
+  abstract class Source extends DataFlow::Node {
+    /** Gets the classification of the sensitive data. */
+    abstract string getClassification();
+  }
+
+  /**
+   * A data flow sink for sensitive data that requires computationally expensive
+   * hashing for "use of a broken or weak cryptographic hashing algorithm on sensitive
+   * data" vulnerabilities.
+   */
+  abstract class Sink extends DataFlow::Node {
+    /**
+     * Gets the name of the weak hashing algorithm.
+     */
+    abstract string getAlgorithmName();
+
+    /**
+     * Holds if this sink is for a computationally expensive hash function (meaning that
+     * hash function is just weak in some other regard.
+     */
+    abstract predicate isComputationallyExpensive();
+  }
+
+  /**
+   * A sanitizer of sensitive data that requires computationally expensive
+   * hashing for "use of a broken or weak cryptographic hashing
+   * algorithm on sensitive data" vulnerabilities.
+   */
+  abstract class Sanitizer extends DataFlow::Node { }
+
+  /**
+   * A source of passwords, considered as a flow source.
+   */
+  class PasswordSourceAsSource extends Source, SensitiveDataSource {
+    PasswordSourceAsSource() {
+      // TODO: once https://github.com/github/codeql/pull/5739 has been merged,
+      // don't use hardcoded value anymore
+      SensitiveDataSource.super.getClassification() = "password"
+    }
+
+    override string getClassification() { result = SensitiveDataSource.super.getClassification() }
+  }
+
+  /**
+   * The input to a password hashing operation using a weak algorithm, considered as a
+   * flow sink.
+   */
+  class WeakPasswordHashingOperationInputSink extends Sink {
+    Cryptography::CryptographicAlgorithm algorithm;
+
+    WeakPasswordHashingOperationInputSink() {
+      (
+        algorithm instanceof Cryptography::PasswordHashingAlgorithm and
+        algorithm.isWeak()
+        or
+        algorithm instanceof Cryptography::HashingAlgorithm
+      ) and
+      exists(Cryptography::CryptographicOperation operation |
+        algorithm = operation.getAlgorithm() and
+        this = operation.getAnInput()
+      )
+    }
+
+    override string getAlgorithmName() { result = algorithm.getName() }
+
+    override predicate isComputationallyExpensive() {
+      algorithm instanceof Cryptography::PasswordHashingAlgorithm
+    }
+  }
+}
--- a/python/ql/test/query-tests/Security/CWE-327-WeakSensitiveDataHashing/README.md
+++ b/python/ql/test/query-tests/Security/CWE-327-WeakSensitiveDataHashing/README.md
@@ -0,0 +1,3 @@
+Note that the tests in this directory are very shallow, and simply show that the query is able to produce alerts.
+
+More in-depth tests can be found for the individual frameworks that we have modeled `Cryptography::CryptographicOperation` for.
--- a/python/ql/test/query-tests/Security/CWE-327-WeakSensitiveDataHashing/WeakSensitiveDataHashing.expected
+++ b/python/ql/test/query-tests/Security/CWE-327-WeakSensitiveDataHashing/WeakSensitiveDataHashing.expected
@@ -0,0 +1,27 @@
+edges
+| test_cryptodome.py:6:17:6:33 | ControlFlowNode for get_certificate() | test_cryptodome.py:8:19:8:27 | ControlFlowNode for dangerous |
+| test_cryptodome.py:13:17:13:30 | ControlFlowNode for get_password() | test_cryptodome.py:15:19:15:27 | ControlFlowNode for dangerous |
+| test_cryptodome.py:20:17:20:30 | ControlFlowNode for get_password() | test_cryptodome.py:24:19:24:27 | ControlFlowNode for dangerous |
+| test_cryptography.py:7:17:7:33 | ControlFlowNode for get_certificate() | test_cryptography.py:9:19:9:27 | ControlFlowNode for dangerous |
+| test_cryptography.py:15:17:15:30 | ControlFlowNode for get_password() | test_cryptography.py:17:19:17:27 | ControlFlowNode for dangerous |
+| test_cryptography.py:23:17:23:30 | ControlFlowNode for get_password() | test_cryptography.py:27:19:27:27 | ControlFlowNode for dangerous |
+nodes
+| test_cryptodome.py:6:17:6:33 | ControlFlowNode for get_certificate() | semmle.label | ControlFlowNode for get_certificate() |
+| test_cryptodome.py:8:19:8:27 | ControlFlowNode for dangerous | semmle.label | ControlFlowNode for dangerous |
+| test_cryptodome.py:13:17:13:30 | ControlFlowNode for get_password() | semmle.label | ControlFlowNode for get_password() |
+| test_cryptodome.py:15:19:15:27 | ControlFlowNode for dangerous | semmle.label | ControlFlowNode for dangerous |
+| test_cryptodome.py:20:17:20:30 | ControlFlowNode for get_password() | semmle.label | ControlFlowNode for get_password() |
+| test_cryptodome.py:24:19:24:27 | ControlFlowNode for dangerous | semmle.label | ControlFlowNode for dangerous |
+| test_cryptography.py:7:17:7:33 | ControlFlowNode for get_certificate() | semmle.label | ControlFlowNode for get_certificate() |
+| test_cryptography.py:9:19:9:27 | ControlFlowNode for dangerous | semmle.label | ControlFlowNode for dangerous |
+| test_cryptography.py:15:17:15:30 | ControlFlowNode for get_password() | semmle.label | ControlFlowNode for get_password() |
+| test_cryptography.py:17:19:17:27 | ControlFlowNode for dangerous | semmle.label | ControlFlowNode for dangerous |
+| test_cryptography.py:23:17:23:30 | ControlFlowNode for get_password() | semmle.label | ControlFlowNode for get_password() |
+| test_cryptography.py:27:19:27:27 | ControlFlowNode for dangerous | semmle.label | ControlFlowNode for dangerous |
+#select
+| test_cryptodome.py:8:19:8:27 | ControlFlowNode for dangerous | test_cryptodome.py:6:17:6:33 | ControlFlowNode for get_certificate() | test_cryptodome.py:8:19:8:27 | ControlFlowNode for dangerous | $@ is used in a hashing algorithm (MD5) that is insecure. | test_cryptodome.py:6:17:6:33 | ControlFlowNode for get_certificate() | Sensitive data (certificate) |
+| test_cryptodome.py:15:19:15:27 | ControlFlowNode for dangerous | test_cryptodome.py:13:17:13:30 | ControlFlowNode for get_password() | test_cryptodome.py:15:19:15:27 | ControlFlowNode for dangerous | $@ is used in a hashing algorithm (MD5) that is insecure for password hashing, since it is not a computationally expensive hash function. | test_cryptodome.py:13:17:13:30 | ControlFlowNode for get_password() | Sensitive data (password) |
+| test_cryptodome.py:24:19:24:27 | ControlFlowNode for dangerous | test_cryptodome.py:20:17:20:30 | ControlFlowNode for get_password() | test_cryptodome.py:24:19:24:27 | ControlFlowNode for dangerous | $@ is used in a hashing algorithm (SHA256) that is insecure for password hashing, since it is not a computationally expensive hash function. | test_cryptodome.py:20:17:20:30 | ControlFlowNode for get_password() | Sensitive data (password) |
+| test_cryptography.py:9:19:9:27 | ControlFlowNode for dangerous | test_cryptography.py:7:17:7:33 | ControlFlowNode for get_certificate() | test_cryptography.py:9:19:9:27 | ControlFlowNode for dangerous | $@ is used in a hashing algorithm (MD5) that is insecure. | test_cryptography.py:7:17:7:33 | ControlFlowNode for get_certificate() | Sensitive data (certificate) |
+| test_cryptography.py:17:19:17:27 | ControlFlowNode for dangerous | test_cryptography.py:15:17:15:30 | ControlFlowNode for get_password() | test_cryptography.py:17:19:17:27 | ControlFlowNode for dangerous | $@ is used in a hashing algorithm (MD5) that is insecure for password hashing, since it is not a computationally expensive hash function. | test_cryptography.py:15:17:15:30 | ControlFlowNode for get_password() | Sensitive data (password) |
+| test_cryptography.py:27:19:27:27 | ControlFlowNode for dangerous | test_cryptography.py:23:17:23:30 | ControlFlowNode for get_password() | test_cryptography.py:27:19:27:27 | ControlFlowNode for dangerous | $@ is used in a hashing algorithm (SHA256) that is insecure for password hashing, since it is not a computationally expensive hash function. | test_cryptography.py:23:17:23:30 | ControlFlowNode for get_password() | Sensitive data (password) |
--- a/python/ql/test/query-tests/Security/CWE-327-WeakSensitiveDataHashing/WeakSensitiveDataHashing.qlref
+++ b/python/ql/test/query-tests/Security/CWE-327-WeakSensitiveDataHashing/WeakSensitiveDataHashing.qlref
@@ -0,0 +1 @@
+Security/CWE-327/WeakSensitiveDataHashing.ql
--- a/python/ql/test/query-tests/Security/CWE-327-WeakSensitiveDataHashing/test_cryptodome.py
+++ b/python/ql/test/query-tests/Security/CWE-327-WeakSensitiveDataHashing/test_cryptodome.py
@@ -0,0 +1,25 @@
+from Cryptodome.Hash import MD5, SHA256
+from my_module import get_password, get_certificate
+
+
+def get_badly_hashed_certificate():
+    dangerous = get_certificate()
+    hasher = MD5.new()
+    hasher.update(dangerous) # NOT OK
+    return hasher.hexdigest()
+
+
+def get_badly_hashed_password():
+    dangerous = get_password()
+    hasher = MD5.new()
+    hasher.update(dangerous) # NOT OK
+    return hasher.hexdigest()
+
+
+def get_badly_hashed_password2():
+    dangerous = get_password()
+    # Although SHA-256 is a strong cryptographic hash functions,
+    # it is not suitable for password hashing.
+    hasher = SHA256.new()
+    hasher.update(dangerous) # NOT OK
+    return hasher.hexdigest()
--- a/python/ql/test/query-tests/Security/CWE-327-WeakSensitiveDataHashing/test_cryptography.py
+++ b/python/ql/test/query-tests/Security/CWE-327-WeakSensitiveDataHashing/test_cryptography.py
@@ -0,0 +1,29 @@
+from cryptography.hazmat.primitives import hashes
+from binascii import hexlify
+from my_module import get_password, get_certificate
+
+
+def get_badly_hashed_certificate():
+    dangerous = get_certificate()
+    hasher = hashes.Hash(hashes.MD5())
+    hasher.update(dangerous) # NOT OK
+    digest = hasher.finalize()
+    return hexlify(digest).decode("utf-8")
+
+
+def get_badly_hashed_password():
+    dangerous = get_password()
+    hasher = hashes.Hash(hashes.MD5())
+    hasher.update(dangerous) # NOT OK
+    digest = hasher.finalize()
+    return hexlify(digest).decode("utf-8")
+
+
+def get_badly_hashed_password2():
+    dangerous = get_password()
+    # Although SHA-256 is a strong cryptographic hash functions,
+    # it is not suitable for password hashing.
+    hasher = hashes.Hash(hashes.SHA256())
+    hasher.update(dangerous) # NOT OK
+    digest = hasher.finalize()
+    return hexlify(digest).decode("utf-8")
				`@@ -0,0 +1 @@`
				`Security/CWE-327/WeakSensitiveDataHashing.ql`