Merge pull request #2670 from github/starcke/apply-slice-filter

Use filtering queries to do batched AI querying
This commit is contained in:
Anders Starcke Henriksen
2023-08-07 11:37:40 +02:00
committed by GitHub
5 changed files with 325 additions and 9 deletions

View File

@@ -17,6 +17,10 @@ import { redactableError } from "../common/errors";
import { interpretResultsSarif } from "../query-results";
import { join } from "path";
import { assertNever } from "../common/helpers-pure";
import { dir } from "tmp-promise";
import { writeFile, outputFile } from "fs-extra";
import { dump as dumpYaml } from "js-yaml";
import { MethodSignature } from "./external-api-usage";
type AutoModelQueryOptions = {
queryTag: string;
@@ -26,6 +30,7 @@ type AutoModelQueryOptions = {
databaseItem: DatabaseItem;
qlpack: QlPacksForLanguage;
sourceInfo: SourceInfo | undefined;
additionalPacks: string[];
extensionPacks: string[];
queryStorageDir: string;
@@ -52,6 +57,7 @@ async function runAutoModelQuery({
databaseItem,
qlpack,
sourceInfo,
additionalPacks,
extensionPacks,
queryStorageDir,
progress,
@@ -99,7 +105,7 @@ async function runAutoModelQuery({
quickEvalCountOnly: false,
},
false,
getOnDiskWorkspaceFolders(),
additionalPacks,
extensionPacks,
queryStorageDir,
undefined,
@@ -147,6 +153,7 @@ async function runAutoModelQuery({
type AutoModelQueriesOptions = {
mode: Mode;
candidateMethods: MethodSignature[];
cliServer: CodeQLCliServer;
queryRunner: QueryRunner;
databaseItem: DatabaseItem;
@@ -161,6 +168,7 @@ export type AutoModelQueriesResult = {
export async function runAutoModelQueries({
mode,
candidateMethods,
cliServer,
queryRunner,
databaseItem,
@@ -189,7 +197,13 @@ export async function runAutoModelQueries({
sourceLocationPrefix,
};
const additionalPacks = getOnDiskWorkspaceFolders();
// Generate a pack containing the candidate filters
const filterPackDir = await generateCandidateFilterPack(
databaseItem.language,
candidateMethods,
);
const additionalPacks = [...getOnDiskWorkspaceFolders(), filterPackDir];
const extensionPacks = Object.keys(
await cliServer.resolveQlpacks(additionalPacks, true),
);
@@ -208,6 +222,7 @@ export async function runAutoModelQueries({
databaseItem,
qlpack,
sourceInfo,
additionalPacks,
extensionPacks,
queryStorageDir,
progress: (update) => {
@@ -228,3 +243,59 @@ export async function runAutoModelQueries({
candidates,
};
}
/**
 * Writes a temporary CodeQL extension pack that narrows the automodel queries
 * down to a given set of candidate methods.
 *
 * Two files are written into a freshly created temporary directory:
 * - `codeql-pack.yml`: the manifest of the library pack `codeql/automodel-filter`,
 *   which targets `codeql/<language>-queries` and lists `filter.yml` as its
 *   data extension.
 * - `filter.yml`: a data extension adding one row per candidate method to the
 *   extensible predicate `automodelCandidateFilter`.
 *
 * @param language Language name used to build the target pack name
 *   `codeql/<language>-queries`.
 * @param candidateMethods Methods for which a filter row is emitted.
 * @returns Path of the temporary directory that contains the generated pack.
 */
export async function generateCandidateFilterPack(
  language: string,
  candidateMethods: MethodSignature[],
): Promise<string> {
  const targetPack = `codeql/${language}-queries`;

  // Keep the generated pack in a temporary directory so the workspace is not polluted.
  const { path: packDir } = await dir({ unsafeCleanup: true });

  const packManifest = {
    name: "codeql/automodel-filter",
    version: "0.0.0",
    library: true,
    extensionTargets: {
      [targetPack]: "*",
    },
    dataExtensions: ["filter.yml"],
  };
  await outputFile(
    join(packDir, "codeql-pack.yml"),
    dumpYaml(packManifest),
    "utf8",
  );

  // Each row matches the columns of the extensible predicate:
  //   automodelCandidateFilter(string package, string type, string name, string signature)
  const filterExtension = {
    extensions: [
      {
        addsTo: {
          pack: targetPack,
          extensible: "automodelCandidateFilter",
        },
        data: candidateMethods.map(
          ({ packageName, typeName, methodName, methodParameters }) => [
            packageName,
            typeName,
            methodName,
            methodParameters,
          ],
        ),
      },
    ],
  };
  await writeFile(
    join(packDir, "filter.yml"),
    dumpYaml(filterExtension),
    "utf8",
  );

  return packDir;
}

View File

@@ -4,6 +4,63 @@ import { AutoModelQueriesResult } from "./auto-model-codeml-queries";
import { assertNever } from "../common/helpers-pure";
import * as Sarif from "sarif";
import { gzipEncode } from "../common/zlib";
import { ExternalApiUsage, MethodSignature } from "./external-api-usage";
import { ModeledMethod } from "./modeled-method";
import { groupMethods, sortGroupNames, sortMethods } from "./shared/sorting";
// Soft limit on the number of candidates to send to the model.
// Note that the model may return fewer than this number of candidates.
const candidateLimit = 20;

/**
 * Selects the methods that the model should be run on.
 *
 * Usages are ordered the same way as in the UI (grouped, groups sorted, methods
 * sorted within each group) so that the first ones listed in the UI are sent
 * first. A usage is dropped when:
 * - `modeledMethods` already holds an entry for its signature whose type is
 *   not "none", or
 * - it is flagged as `supported`, i.e. modeled outside of the model file.
 *
 * At most `candidateLimit` methods are returned.
 *
 * @param mode Whether it is application or framework mode.
 * @param externalApiUsages All external API usages.
 * @param modeledMethods The currently modeled methods, keyed by method signature.
 * @returns The methods that are candidates for modeling, in UI order.
 */
export function getCandidates(
  mode: Mode,
  externalApiUsages: ExternalApiUsage[],
  modeledMethods: Record<string, ModeledMethod>,
): MethodSignature[] {
  const groups = groupMethods(externalApiUsages, mode);
  const usagesInUiOrder = sortGroupNames(groups).flatMap((groupName) =>
    sortMethods(groups[groupName]),
  );

  return usagesInUiOrder
    .filter((usage) => {
      // A missing entry is treated the same as an entry of type "none".
      const existingModel = modeledMethods[usage.signature];
      const alreadyModeled =
        existingModel !== undefined &&
        existingModel !== null &&
        existingModel.type !== "none";

      return !alreadyModeled && !usage.supported;
    })
    .slice(0, candidateLimit);
}
/**
* Encode a SARIF log to the format expected by the server: JSON, GZIP-compressed, base64-encoded

View File

@@ -56,9 +56,10 @@ import { join } from "path";
import { pickExtensionPack } from "./extension-pack-picker";
import { getLanguageDisplayName } from "../common/query-language";
import { runAutoModelQueries } from "./auto-model-codeml-queries";
import { createAutoModelV2Request } from "./auto-model-v2";
import { createAutoModelV2Request, getCandidates } from "./auto-model-v2";
import { load as loadYaml } from "js-yaml";
import { loadDataExtensionYaml } from "./yaml";
import { extLogger } from "../common/logging/vscode";
export class DataExtensionsEditorView extends AbstractWebview<
ToDataExtensionsEditorMessage,
@@ -377,8 +378,22 @@ export class DataExtensionsEditorView extends AbstractWebview<
let predictedModeledMethods: Record<string, ModeledMethod>;
if (useLlmGenerationV2()) {
// Fetch the candidates to send to the model
const candidateMethods = getCandidates(
this.mode,
externalApiUsages,
modeledMethods,
);
// If there are no candidates, there is nothing to model and we just return
if (candidateMethods.length === 0) {
void extLogger.log("No candidates to model. Stopping.");
return;
}
const usages = await runAutoModelQueries({
mode: this.mode,
candidateMethods,
cliServer: this.cliServer,
queryRunner: this.queryRunner,
queryStorageDir: this.queryStorageDir,
@@ -418,12 +433,33 @@ export class DataExtensionsEditorView extends AbstractWebview<
filename: "auto-model.yml",
});
const modeledMethods = loadDataExtensionYaml(models);
if (!modeledMethods) {
const loadedMethods = loadDataExtensionYaml(models);
if (!loadedMethods) {
return;
}
predictedModeledMethods = modeledMethods;
// Any candidate that was not part of the response is a negative result,
// meaning that the candidate is not a sink for the kinds that the LLM is checking for.
// For now we model this as a sink neutral method, however this is subject
// to discussion.
for (const candidate of candidateMethods) {
if (!(candidate.signature in loadedMethods)) {
loadedMethods[candidate.signature] = {
type: "neutral",
kind: "sink",
input: "",
output: "",
provenance: "ai-generated",
signature: candidate.signature,
packageName: candidate.packageName,
typeName: candidate.typeName,
methodName: candidate.methodName,
methodParameters: candidate.methodParameters,
};
}
}
predictedModeledMethods = loadedMethods;
} else {
const usages = await getAutoModelUsages({
cliServer: this.cliServer,

View File

@@ -1,12 +1,15 @@
import {
createAutoModelV2Request,
encodeSarif,
getCandidates,
} from "../../../src/data-extensions-editor/auto-model-v2";
import { Mode } from "../../../src/data-extensions-editor/shared/mode";
import { AutomodelMode } from "../../../src/data-extensions-editor/auto-model-api-v2";
import { AutoModelQueriesResult } from "../../../src/data-extensions-editor/auto-model-codeml-queries";
import * as sarif from "sarif";
import { gzipDecode } from "../../../src/common/zlib";
import { ExternalApiUsage } from "../../../src/data-extensions-editor/external-api-usage";
import { ModeledMethod } from "../../../src/data-extensions-editor/modeled-method";
describe("createAutoModelV2Request", () => {
const createSarifLog = (queryId: string): sarif.Log => {
@@ -80,3 +83,110 @@ describe("createAutoModelV2Request", () => {
expect(parsed).toEqual(result.candidates);
});
});
describe("getCandidates", () => {
  // Builds an unsupported usage of `org.my.A#<methodName>()`; individual fields
  // can be replaced through `overrides`.
  const createUsage = (
    methodName = "x",
    overrides: Partial<ExternalApiUsage> = {},
  ): ExternalApiUsage => ({
    library: "my.jar",
    signature: `org.my.A#${methodName}()`,
    packageName: "org.my",
    typeName: "A",
    methodName,
    methodParameters: "()",
    supported: false,
    supportedType: "none",
    usages: [],
    ...overrides,
  });

  it("doesn't return methods that are already modelled", () => {
    const externalApiUsages: ExternalApiUsage[] = [createUsage()];
    // The only usage already has a non-"none" model registered for its signature.
    const modeledMethods: Record<string, ModeledMethod> = {
      "org.my.A#x()": {
        type: "neutral",
        kind: "",
        input: "",
        output: "",
        provenance: "manual",
        signature: "org.my.A#x()",
        packageName: "org.my",
        typeName: "A",
        methodName: "x",
        methodParameters: "()",
      },
    };

    const candidates = getCandidates(
      Mode.Application,
      externalApiUsages,
      modeledMethods,
    );

    expect(candidates).toHaveLength(0);
  });

  it("doesn't return methods that are supported from other sources", () => {
    const externalApiUsages: ExternalApiUsage[] = [
      createUsage("x", { supported: true }),
    ];
    const modeledMethods = {};

    const candidates = getCandidates(
      Mode.Application,
      externalApiUsages,
      modeledMethods,
    );

    expect(candidates).toHaveLength(0);
  });

  it("returns methods that are neither modeled nor supported from other sources", () => {
    const externalApiUsages: ExternalApiUsage[] = [createUsage()];
    const modeledMethods = {};

    const candidates = getCandidates(
      Mode.Application,
      externalApiUsages,
      modeledMethods,
    );

    expect(candidates).toHaveLength(1);
  });

  it("respects the limit", () => {
    // 30 distinct eligible usages, of which 20 are expected back.
    const externalApiUsages: ExternalApiUsage[] = Array.from(
      { length: 30 },
      (_, i) => createUsage(`x${i}`),
    );
    const modeledMethods = {};

    const candidates = getCandidates(
      Mode.Application,
      externalApiUsages,
      modeledMethods,
    );

    expect(candidates).toHaveLength(20);
  });
});

View File

@@ -5,13 +5,20 @@ import {
} from "../../../../src/databases/local-databases";
import { file } from "tmp-promise";
import { QueryResultType } from "../../../../src/query-server/new-messages";
import { runAutoModelQueries } from "../../../../src/data-extensions-editor/auto-model-codeml-queries";
import {
generateCandidateFilterPack,
runAutoModelQueries,
} from "../../../../src/data-extensions-editor/auto-model-codeml-queries";
import { Mode } from "../../../../src/data-extensions-editor/shared/mode";
import { mockedObject, mockedUri } from "../../utils/mocking.helpers";
import { CodeQLCliServer } from "../../../../src/codeql-cli/cli";
import { QueryRunner } from "../../../../src/query-server";
import * as queryResolver from "../../../../src/local-queries/query-resolver";
import * as standardQueries from "../../../../src/local-queries/standard-queries";
import { MethodSignature } from "../../../../src/data-extensions-editor/external-api-usage";
import { join } from "path";
import { exists, readFile } from "fs-extra";
import { load as loadYaml } from "js-yaml";
describe("runAutoModelQueries", () => {
const qlpack = {
@@ -60,6 +67,7 @@ describe("runAutoModelQueries", () => {
const options = {
mode: Mode.Application,
candidateMethods: [],
cliServer: mockedObject<CodeQLCliServer>({
resolveQlpacks: jest.fn().mockResolvedValue({
"/a/b/c/my-extension-pack": {},
@@ -140,7 +148,10 @@ describe("runAutoModelQueries", () => {
expect(result).not.toBeUndefined();
expect(options.cliServer.resolveQlpacks).toHaveBeenCalledTimes(1);
expect(options.cliServer.resolveQlpacks).toHaveBeenCalledWith([], true);
expect(options.cliServer.resolveQlpacks).toHaveBeenCalledWith(
expect.arrayContaining([expect.stringContaining("tmp")]),
true,
);
expect(resolveQueriesSpy).toHaveBeenCalledTimes(1);
expect(resolveQueriesSpy).toHaveBeenCalledWith(
options.cliServer,
@@ -165,7 +176,7 @@ describe("runAutoModelQueries", () => {
quickEvalCountOnly: false,
},
false,
[],
expect.arrayContaining([expect.stringContaining("tmp")]),
["/a/b/c/my-extension-pack"],
"/tmp/queries",
undefined,
@@ -173,3 +184,34 @@ describe("runAutoModelQueries", () => {
);
});
});
describe("generateCandidateFilterPack", () => {
  it("should create a temp pack containing the candidate filters", async () => {
    const candidateMethods: MethodSignature[] = [
      {
        signature: "org.my.A#x()",
        packageName: "org.my",
        typeName: "A",
        methodName: "x",
        methodParameters: "()",
      },
    ];

    const packDir = await generateCandidateFilterPack("java", candidateMethods);
    expect(packDir).not.toBeUndefined();

    // Both the pack manifest and the data extension file must have been written.
    const qlpackFile = join(packDir, "codeql-pack.yml");
    expect(await exists(qlpackFile)).toBe(true);
    const filterFile = join(packDir, "filter.yml");
    expect(await exists(filterFile)).toBe(true);

    // Read the contents of filterFile and parse as yaml.
    // `loadYaml` is synchronous, so only the file read is awaited.
    const yaml = loadYaml(await readFile(filterFile, "utf8"));

    // The file holds exactly one extension, targeting the filter predicate
    // of the java query pack.
    const extensions = yaml.extensions;
    expect(extensions).toBeInstanceOf(Array);
    expect(extensions).toHaveLength(1);
    const extension = extensions[0];
    expect(extension.addsTo.pack).toEqual("codeql/java-queries");
    expect(extension.addsTo.extensible).toEqual("automodelCandidateFilter");

    // One data row per candidate: [package, type, name, parameters].
    expect(extension.data).toBeInstanceOf(Array);
    expect(extension.data).toHaveLength(1);
    expect(extension.data[0]).toEqual(["org.my", "A", "x", "()"]);
  });
});