Python: Fix bad join in StrConst::isUnicode

Also fixes a bug: `B` was not recognised as a bytestring prefix. The basic idea behind this fix is that the set of possible prefixes is fairly small, so it is easier to precompute the relevant prefixes once and then join them against the entire prefix of the string in question. Previously, the code essentially looked at each string in isolation, got its prefix, and only _then_ checked whether that prefix looked like a Unicode string prefix.
2026-05-01 03:35:13 +02:00 · 2020-11-05 16:45:27 +01:00
parent 1251bc57f5
commit bae4acabb1
1 changed files with 29 additions and 7 deletions
--- a/python/ql/src/semmle/python/Exprs.qll
+++ b/python/ql/src/semmle/python/Exprs.qll
@@ -584,18 +584,40 @@ class Slice extends Slice_ {
  }
 }

+/**
+ * Gets a string prefix occurring in the database that contains `u` or `U`, that is, one explicitly marking a Unicode string.
+ *
+ * Helper predicate for `StrConst::isUnicode`.
+ */
+pragma[nomagic]
+private string unicode_prefix() {
+  result = any(Str_ s).getPrefix() and
+  result.charAt(_) in ["u", "U"]
+}
+
+/**
+ * Gets a string prefix occurring in the database that contains neither `b` nor `B`, that is, one _not_ explicitly marking a bytestring.
+ *
+ * Helper predicate for `StrConst::isUnicode`.
+ */
+pragma[nomagic]
+private string non_byte_prefix() {
+  result = any(Str_ s).getPrefix() and
+  not result.charAt(_) in ["b", "B"]
+}
+
 /** A string constant. */
 class StrConst extends Str_, ImmutableLiteral {
  /* syntax: "hello" */
  predicate isUnicode() {
-    this.getPrefix().charAt(_) = "u"
+    this.getPrefix() = unicode_prefix()
    or
-    this.getPrefix().charAt(_) = "U"
-    or
-    not this.getPrefix().charAt(_) = "b" and major_version() = 3
-    or
-    not this.getPrefix().charAt(_) = "b" and
-    this.getEnclosingModule().hasFromFuture("unicode_literals")
+    this.getPrefix() = non_byte_prefix() and
+    (
+      major_version() = 3
+      or
+      this.getEnclosingModule().hasFromFuture("unicode_literals")
+    )
  }

  deprecated override string strValue() { result = this.getS() }