Java: add draft of generated vs manual MaD metrics query

2026-04-29 18:55:14 +02:00 · 2022-12-06 22:15:19 -05:00
parent 5d43c431c0
commit b82f9b1911
7 changed files with 286 additions and 6 deletions
--- a/java/ql/lib/semmle/code/java/dataflow/internal/FlowSummaryImpl.qll
+++ b/java/ql/lib/semmle/code/java/dataflow/internal/FlowSummaryImpl.qll
@@ -241,9 +241,19 @@ module Public {
    }

    /**
-     * Holds if the summary is auto generated.
+     * Holds if the summary is auto generated and not manually generated.
     */
    predicate isAutoGenerated() { none() }
+
+    /**
+     * Holds if the summary is manually created and not auto generated. Never holds unless overridden.
+     */
+    predicate isManuallyGenerated() { none() }
+
+    /**
+     * Holds if the summary is both auto generated and manually created. Never holds unless overridden.
+     */
+    predicate isBothAutoAndManuallyGenerated() { none() }
  }

  /** A callable with a flow summary stating there is no flow via the callable. */
@@ -991,6 +1001,20 @@ module Private {
        not summaryElement(this, _, _, _, false)
      }

+      private predicate relevantSummaryElementManual(
+        AccessPath inSpec, AccessPath outSpec, string kind
+      ) {
+        summaryElement(this, inSpec, outSpec, kind, false) and // a row whose last column is `false` (presumably the "generated" flag, i.e. a manual row)
+        not summaryElement(this, _, _, _, true) // and this callable has no row at all whose last column is `true`
+      }
+
+      private predicate relevantSummaryElementBothGeneratedAndManual(
+        AccessPath inSpec, AccessPath outSpec, string kind
+      ) {
+        summaryElement(this, inSpec, outSpec, kind, false) and // the manual rows of this callable ...
+        summaryElement(this, _, _, _, true) // ... provided it also has some generated row; the rows need not be identical, so that generated-only, manual-only and "both" together cover every callable with a row
+      }
+
      private predicate relevantSummaryElement(AccessPath inSpec, AccessPath outSpec, string kind) {
        summaryElement(this, inSpec, outSpec, kind, false)
        or
@@ -1012,6 +1036,12 @@ module Private {
      }

      override predicate isAutoGenerated() { this.relevantSummaryElementGenerated(_, _, _) }
+
+      override predicate isManuallyGenerated() { this.relevantSummaryElementManual(_, _, _) } // holds if at least one row satisfies `relevantSummaryElementManual`
+
+      override predicate isBothAutoAndManuallyGenerated() {
+        this.relevantSummaryElementBothGeneratedAndManual(_, _, _) // holds if at least one row satisfies the helper; the row's columns are ignored
+      }
    }

    /** Holds if component `c` of specification `spec` cannot be parsed. */
--- a/java/ql/src/Metrics/Summaries/GeneratedVsManualCoverage.ql
+++ b/java/ql/src/Metrics/Summaries/GeneratedVsManualCoverage.ql
@@ -0,0 +1,100 @@
+/**
+ * @id java/summary/generated-vs-manual-coverage
+ * @name Metrics of generated versus manual MaD coverage
+ * @description Expose metrics for the number of API endpoints covered by generated versus manual MaD models.
+ * @kind table
+ * @tags summary
+ */
+
+//import java // not needed I guess
+import semmle.code.java.dataflow.FlowSummary // for SummarizedCallable
+import utils.modelgenerator.internal.CaptureModels // for DataFlowTargetApi
+
+// ! improve QLDoc?
+/**
+ * A callable that is both a `SummarizedCallable` and a `DataFlowTargetApi`,
+ * i.e. an element of the intersection of those two classes, excluding
+ * callables that are the method of a `FunctionalExpr`.
+ */
+class MadModeledCallable extends SummarizedCallableBase {
+  // ! better name for this class?
+  MadModeledCallable() {
+    exists(DataFlowTargetApi api | api = this.asCallable() |
+      // ! drop this filter if DataFlowTargetApi itself is adjusted to exclude FunctionalExpr (see static-team slack thread)
+      not api = any(FunctionalExpr fe).asMethod()
+    ) and
+    this instanceof SummarizedCallable
+  }
+}
+
+// ! move to other file
+/**
+ * Gets the number of `MadModeledCallable`s declared in `package` whose
+ * summary provenance is `provenance`: one of "generated" (generated only),
+ * "manual" (manual only), or "both".
+ */
+float getNumMadModels(string package, string provenance) {
+  // Only packages declaring at least one `MadModeledCallable` are reported;
+  // for those, every provenance gets a value, which is 0 if nothing matches.
+  package = any(MadModeledCallable mc).asCallable().getDeclaringType().getPackage().toString() and
+  provenance = ["generated", "manual", "both"] and
+  result =
+    count(MadModeledCallable c |
+      c.asCallable().getDeclaringType().getPackage().toString() = package and
+      exists(SummarizedCallable sc | sc = c |
+        provenance = "generated" and
+        sc.isAutoGenerated()
+        or
+        provenance = "manual" and
+        sc.isManuallyGenerated()
+        or
+        provenance = "both" and
+        sc.isBothAutoAndManuallyGenerated()
+      )
+    )
+}
+
+// ! move to other file
+/**
+ * Gets the number of `DataFlowTargetApi`s declared in `package` that are not
+ * the method of a `FunctionalExpr` and not the callable of any `SummarizedCallable`.
+ */
+float getNumApisWithoutMadModel(string package) {
+  // ! drop the FunctionalExpr filters if DataFlowTargetApi itself is adjusted to exclude them (see static-team slack thread)
+  // `package` ranges over packages declaring at least one such target API.
+  exists(DataFlowTargetApi anyApi | not anyApi = any(FunctionalExpr fe).asMethod() |
+    package = anyApi.getDeclaringType().getPackage().toString()
+  ) and
+  result =
+    count(DataFlowTargetApi api |
+      api.getDeclaringType().getPackage().toString() = package and
+      not api = any(FunctionalExpr fe).asMethod() and // remove lambdas
+      not api = any(SummarizedCallable sc).asCallable() // set minus with SummarizedCallables
+    )
+}
+
+// ! Note: adjust metric formulas as needed after more discussion with Yorck
+/*
+ * metric1:
+ * Proportion of manual models covered by automation: “both” / (“both” + “manual-only”)
+ * Auto-generated vs all positive manual (percentage of manual models covered by auto-generation)
+ */
+
+/*
+ * metric2:
+ * Coverage relative to total number of APIs: (“auto-only” + “both” + “manual-only”) / “all”
+ * Auto-generated vs specific pos+neg subset (top-N manual, random)
+ */
+
+from
+  string package, float generated, float manual, float both, float notModeled, float all,
+  float metric1, float metric2
+where
+  generated = getNumMadModels(package, "generated") and
+  manual = getNumMadModels(package, "manual") and
+  both = getNumMadModels(package, "both") and
+  notModeled = getNumApisWithoutMadModel(package) and // ! better name for this?, "none" is a reserved keyword :(
+  all = generated + manual + both + notModeled and
+  metric1 = (both / (both + manual)) and // NOTE(review): `both + manual` is 0 for a package with only generated models; confirm what this division yields and whether such rows should still be reported
+  metric2 = (generated + both + manual) / all
+select package, generated, manual, both, notModeled, all, metric1, metric2 order by package