add a Python implementation of RegexTreeViewSig

2025-12-24 12:46:34 +01:00 · 2022-11-01 12:02:48 +01:00
parent 5fbcbbc584
commit 1aeaefca7f
1 changed files with 89 additions and 27 deletions
--- a/python/ql/lib/semmle/python/RegexTreeView.qll
+++ b/python/ql/lib/semmle/python/RegexTreeView.qll
@@ -2,6 +2,10 @@
 import python
 private import semmle.python.regex
 private import codeql.regex.nfa.NfaUtils as NfaUtils
 private import codeql.regex.RegexTreeView
 // exporting as RegexTreeView, and in the top-level scope.
 import Impl as RegexTreeView
 import Impl
 /** Gets the parse tree resulting from parsing `re`, if such has been constructed. */
@@ -52,8 +56,34 @@ private newtype TRegExpParent =
  /** A back reference */
  TRegExpBackRef(Regex re, int start, int end) { re.backreference(start, end) }
 /**
  * Gets the end position (`getEnd()`) of the `i`th child of the sequence in `re`
  * that spans `start` to `end`.
  *
  * This is a thin wrapper around `seqChild`; `seqChild` in turn calls it with `i - 1`
  * to find where the next child starts.
  */
 pragma[nomagic]
 private int seqChildEnd(Regex re, int start, int end, int i) {
  result = seqChild(re, start, end, i).getEnd()
 }
 // moved out so we can use it in the charpred
 /**
  * Gets the `i`th child (counting from 0) of the sequence in `re` that spans `start` to `end`.
  *
  * Child 0 begins at `start`; child `i > 0` begins where child `i - 1` ends. In both cases
  * the child's extent must be an item of `re`, as given by `re.item`.
  */
 private RegExpTerm seqChild(Regex re, int start, int end, int i) {
  re.sequence(start, end) and
  (
    // Base case: the first child starts at the start of the sequence.
    i = 0 and
    result.getRegex() = re and
    result.getStart() = start and
    exists(int itemEnd |
      re.item(start, itemEnd) and
      result.getEnd() = itemEnd
    )
    or
    // Recursive case: a later child starts at the end of its predecessor.
    i > 0 and
    result.getRegex() = re and
    exists(int itemStart | itemStart = seqChildEnd(re, start, end, i - 1) |
      result.getStart() = itemStart and
      re.item(itemStart, result.getEnd())
    )
  )
 }
 /** An implementation that satisfies the RegexTreeView signature. */
-module Impl {
+module Impl implements RegexTreeViewSig {
  /**
   * An element containing a regular expression term, that is, either
   * a string literal (parsed as a regular expression)
@@ -391,32 +421,6 @@ module Impl {
    override string getPrimaryQLClass() { result = "RegExpSequence" }
  }
  /**
   * Gets the end position (`getEnd()`) of the `i`th child of the sequence in `re`
   * that spans `start` to `end`.
   *
   * This is a thin wrapper around `seqChild`; `seqChild` in turn calls it with `i - 1`
   * to find where the next child starts.
   */
  pragma[nomagic]
  private int seqChildEnd(Regex re, int start, int end, int i) {
    result = seqChild(re, start, end, i).getEnd()
  }
  // moved out so we can use it in the charpred
  /**
   * Gets the `i`th child (counting from 0) of the sequence in `re` that spans `start` to `end`.
   *
   * Child 0 begins at `start`; child `i > 0` begins where child `i - 1` ends. In both cases
   * the child's extent must be an item of `re`, as given by `re.item`.
   */
  private RegExpTerm seqChild(Regex re, int start, int end, int i) {
    re.sequence(start, end) and
    (
      // Base case: the first child starts at the start of the sequence.
      i = 0 and
      result.getRegex() = re and
      result.getStart() = start and
      exists(int itemEnd |
        re.item(start, itemEnd) and
        result.getEnd() = itemEnd
      )
      or
      // Recursive case: a later child starts at the end of its predecessor.
      i > 0 and
      result.getRegex() = re and
      exists(int itemStart | itemStart = seqChildEnd(re, start, end, i - 1) |
        result.getStart() = itemStart and
        re.item(itemStart, result.getEnd())
      )
    )
  }
  /**
   * An alternative term, that is, a term of the form `a|b`.
   *
@@ -1030,4 +1034,62 @@ module Impl {
    override string getPrimaryQLClass() { result = "RegExpBackRef" }
  }
  /** An alias that exposes `RegExpParent` under the name `Top`. */
  // NOTE(review): presumably the name `Top` is required by `RegexTreeViewSig` - confirm.
  class Top = RegExpParent;
  /**
   * Holds if `term` is a character class escape such as `\d`, and `clazz`
   * identifies which class it stands for (for example "d" for `\d`).
   */
  predicate isEscapeClass(RegExpTerm term, string clazz) {
    clazz = term.(RegExpCharacterClassEscape).getValue()
  }
  /**
   * Holds if `term` is a possessive quantifier.
   *
   * As Python's regexes do not support possessive quantifiers, this never holds,
   * but the predicate is used by the shared library.
   */
  // NOTE(review): Python 3.11 added possessive quantifiers (`*+`, `++`, `?+`) to `re`;
  // confirm whether this predicate should be revisited.
  predicate isPossessive(RegExpQuantifier term) { none() }
  /**
   * Holds if the regex that `term` is part of is used in a way that ignores any leading prefix of the input it's matched against.
   *
   * Not yet implemented for Python: the body is `any()`, so this currently holds for every term.
   */
  predicate matchesAnyPrefix(RegExpTerm term) { any() }
  /**
   * Holds if the regex that `term` is part of is used in a way that ignores any trailing suffix of the input it's matched against.
   *
   * Not yet implemented for Python: the body is `any()`, so this currently holds for every term.
   */
  predicate matchesAnySuffix(RegExpTerm term) { any() }
  /**
   * Holds if the regular expression that `parent` belongs to should not be considered.
   *
   * As a pragmatic performance optimization, regular expressions in files that do not
   * belong to the project code (such as installed dependencies) are ignored, as are
   * regular expressions containing more than 10 occurrences of ".*".
   */
  predicate isExcluded(RegExpParent parent) {
    // Many occurrences of ".*" may cause the polynomial ReDoS computation to explode,
    // so such regexes are explicitly excluded.
    count(int occurrence |
      exists(parent.getRegex().getText().regexpFind("\\.\\*", occurrence, _))
    ) > 10
    or
    // Files without a relative path are not part of the project code.
    not exists(parent.getRegex().getLocation().getFile().getRelativePath())
  }
  /**
   * Holds if `root` is a root term whose literal has the `i` flag set,
   * that is, the regex matches case-insensitively.
   */
  predicate isIgnoreCase(RegExpTerm root) {
    root.getLiteral().isIgnoreCase() and
    root.isRootTerm()
  }
  /**
   * Holds if `root` is a root term whose literal has the `s` flag set,
   * that is, the dot-all flag is enabled for the regex.
   */
  predicate isDotAll(RegExpTerm root) {
    root.getLiteral().isDotAll() and
    root.isRootTerm()
  }
 }