From ea5c36491b3581e40ea56ca1a946ad93eefeced8 Mon Sep 17 00:00:00 2001 From: Taus Date: Wed, 31 May 2023 11:39:20 +0000 Subject: [PATCH] Java: Improve documentation of sampling strategy --- ...AutomodelApplicationModeExtractNegativeExamples.ql | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/java/ql/src/Telemetry/AutomodelApplicationModeExtractNegativeExamples.ql b/java/ql/src/Telemetry/AutomodelApplicationModeExtractNegativeExamples.ql index 7407be0be57..19beefad3d3 100644 --- a/java/ql/src/Telemetry/AutomodelApplicationModeExtractNegativeExamples.ql +++ b/java/ql/src/Telemetry/AutomodelApplicationModeExtractNegativeExamples.ql @@ -14,7 +14,10 @@ private import AutomodelEndpointTypes private import AutomodelSharedUtil /** - * Gets a sample of endpoints for which the given characteristic applies. + * Gets a sample of endpoints (of at most `limit` samples) for which the given characteristic applies. + * + * The main purpose of this helper predicate is to avoid selecting too many samples, as this may + * cause the SARIF file to exceed the maximum size limit. */ bindingset[limit] Endpoint getSampleForCharacteristic(EndpointCharacteristic c, int limit) { @@ -28,7 +31,11 @@ Endpoint getSampleForCharacteristic(EndpointCharacteristic c, int limit) { loc.getFile().getAbsolutePath(), loc.getStartLine(), loc.getStartColumn(), loc.getEndLine(), loc.getEndColumn() ) and - // we order the endpoints by location, but (to avoid bias) we select the indices semi-randomly + // To avoid selecting samples that are too close together (as the ranking above goes by file + // path first), we select `limit` evenly spaced samples from the ranked list of endpoints. By + // default this would always include the first sample, so we add a randomly chosen prime offset + // to the first sample index, and reduce modulo the number of endpoints. + // Finally, we add 1 to the result, as ranking results in a 1-indexed relation. n = 1 + (([0 .. 
limit - 1] * (num_endpoints / limit).floor() + 46337) % num_endpoints) ) }