Java: Improve documentation of sampling strategy

2025-12-24 04:36:35 +01:00 · 2023-05-31 11:39:20 +00:00
parent 5a9d09c49e
commit ea5c36491b
1 changed files with 9 additions and 2 deletions
--- a/java/ql/src/Telemetry/AutomodelApplicationModeExtractNegativeExamples.ql
+++ b/java/ql/src/Telemetry/AutomodelApplicationModeExtractNegativeExamples.ql
@@ -14,7 +14,10 @@ private import AutomodelEndpointTypes
 private import AutomodelSharedUtil

 /**
- * Gets a sample of endpoints for which the given characteristic applies.
+ * Gets a sample of at most `limit` endpoints for which the given characteristic applies.
+ *
+ * The main purpose of this helper predicate is to avoid selecting too many samples, as this may
+ * cause the SARIF file to exceed the maximum size limit.
 */
 bindingset[limit]
 Endpoint getSampleForCharacteristic(EndpointCharacteristic c, int limit) {
@@ -28,7 +31,11 @@ Endpoint getSampleForCharacteristic(EndpointCharacteristic c, int limit) {
          loc.getFile().getAbsolutePath(), loc.getStartLine(), loc.getStartColumn(),
          loc.getEndLine(), loc.getEndColumn()
      ) and
-    // we order the endpoints by location, but (to avoid bias) we select the indices semi-randomly
+    // To avoid selecting samples that are too close together (as the ranking above goes by file
+    // path first), we select `limit` evenly spaced samples from the ranked list of endpoints. By
+    // default this would always include the first sample, so we add a randomly chosen prime offset
+    // to the first sample index, and reduce modulo the number of endpoints.
+    // Finally, we add 1 to the result, as ranking results in a 1-indexed relation.
    n = 1 + (([0 .. limit - 1] * (num_endpoints / limit).floor() + 46337) % num_endpoints)
  )
 }