Merge pull request #19684 from github/nickrolfe/ruby-overlay-extraction

Ruby: add support for extracting overlay databases
This commit is contained in:
Nick Rolfe
2025-06-25 06:39:30 -04:00
committed by GitHub
22 changed files with 6400 additions and 34 deletions

1
Cargo.lock generated
View File

@@ -419,6 +419,7 @@ dependencies = [
"lazy_static",
"rayon",
"regex",
"serde_json",
"tracing",
"tracing-subscriber",
"tree-sitter",

View File

@@ -11,6 +11,7 @@
"/*- Diagnostic messages -*/",
"/*- Diagnostic messages: severity -*/",
"/*- Source location prefix -*/",
"/*- Database metadata -*/",
"/*- Lines of code -*/",
"/*- Configuration files with key value pairs -*/",
"/*- YAML -*/",
@@ -31,4 +32,4 @@
"/*- Python dbscheme -*/",
"/*- Empty location -*/"
]
}
}

View File

@@ -300,6 +300,7 @@ _NORMAL_DEPENDENCIES = {
"lazy_static": Label("@vendor_ts__lazy_static-1.5.0//:lazy_static"),
"rayon": Label("@vendor_ts__rayon-1.10.0//:rayon"),
"regex": Label("@vendor_ts__regex-1.11.1//:regex"),
"serde_json": Label("@vendor_ts__serde_json-1.0.140//:serde_json"),
"tracing": Label("@vendor_ts__tracing-0.1.41//:tracing"),
"tracing-subscriber": Label("@vendor_ts__tracing-subscriber-0.3.19//:tracing_subscriber"),
"tree-sitter": Label("@vendor_ts__tree-sitter-0.24.6//:tree_sitter"),

View File

@@ -36,5 +36,5 @@ pub fn run(options: Options) -> std::io::Result<()> {
},
];
generate(languages, options.dbscheme, options.library)
generate(languages, options.dbscheme, options.library, false)
}

View File

@@ -3,6 +3,7 @@ display_name: "Ruby"
version: 0.1.0
column_kind: "utf8"
legacy_qltest_extraction: true
overlay_support_version: 20250108
build_modes:
- none
github_api_languages:

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,3 @@
description: Add databaseMetadata relation
compatibility: full
databaseMetadata.rel: delete

View File

@@ -17,5 +17,6 @@ rayon = "1.10.0"
regex = "1.11.1"
encoding = "0.2"
lazy_static = "1.5.0"
serde_json = "1.0.140"
codeql-extractor = { path = "../../shared/tree-sitter-extractor" }

View File

@@ -1,7 +1,10 @@
use clap::Args;
use codeql_extractor::file_paths::PathTransformer;
use lazy_static::lazy_static;
use rayon::prelude::*;
use serde_json;
use std::borrow::Cow;
use std::collections::HashSet;
use std::fs;
use std::io::BufRead;
use std::path::{Path, PathBuf};
@@ -78,6 +81,9 @@ pub fn run(options: Options) -> std::io::Result<()> {
let file_list = fs::File::open(file_paths::path_from_string(&options.file_list))?;
let overlay_changed_files: Option<HashSet<PathBuf>> = get_overlay_changed_files();
let path_transformer = file_paths::load_path_transformer()?;
let language: Language = tree_sitter_ruby::LANGUAGE.into();
let erb: Language = tree_sitter_embedded_template::LANGUAGE.into();
// Look up tree-sitter kind ids now, to avoid string comparisons when scanning ERB files.
@@ -94,7 +100,14 @@ pub fn run(options: Options) -> std::io::Result<()> {
.try_for_each(|line| {
let mut diagnostics_writer = diagnostics.logger();
let path = PathBuf::from(line).canonicalize()?;
let src_archive_file = file_paths::path_for(&src_archive_dir, &path, "");
match &overlay_changed_files {
Some(changed_files) if !changed_files.contains(&path) => {
// We are extracting an overlay and this file is not in the list of changed files, so we should skip it.
return Result::Ok(());
}
_ => {},
}
let src_archive_file = file_paths::path_for(&src_archive_dir, &path, "", path_transformer.as_ref());
let mut source = std::fs::read(&path)?;
let mut needs_conversion = false;
let code_ranges;
@@ -107,6 +120,7 @@ pub fn run(options: Options) -> std::io::Result<()> {
&erb_schema,
&mut diagnostics_writer,
&mut trap_writer,
path_transformer.as_ref(),
&path,
&source,
&[],
@@ -151,7 +165,7 @@ pub fn run(options: Options) -> std::io::Result<()> {
"character-decoding-error",
"Character decoding error",
)
.file(&file_paths::normalize_path(&path))
.file(&file_paths::normalize_and_transform_path(&path, path_transformer.as_ref()))
.message(
"Could not decode the file contents as {}: {}. The contents of the file must match the character encoding specified in the {} {}.",
&[
@@ -171,7 +185,7 @@ pub fn run(options: Options) -> std::io::Result<()> {
diagnostics_writer.write(
diagnostics_writer
.new_entry("unknown-character-encoding", "Could not process some files due to an unknown character encoding")
.file(&file_paths::normalize_path(&path))
.file(&file_paths::normalize_and_transform_path(&path, path_transformer.as_ref()))
.message(
"Unknown character encoding {} in {} {}.",
&[
@@ -194,6 +208,7 @@ pub fn run(options: Options) -> std::io::Result<()> {
&schema,
&mut diagnostics_writer,
&mut trap_writer,
path_transformer.as_ref(),
&path,
&source,
&code_ranges,
@@ -204,14 +219,26 @@ pub fn run(options: Options) -> std::io::Result<()> {
} else {
std::fs::copy(&path, &src_archive_file)?;
}
write_trap(&trap_dir, path, &trap_writer, trap_compression)
write_trap(&trap_dir, path, &trap_writer, trap_compression, path_transformer.as_ref())
})
.expect("failed to extract files");
let path = PathBuf::from("extras");
let mut trap_writer = trap::Writer::new();
extractor::populate_empty_location(&mut trap_writer);
let res = write_trap(&trap_dir, path, &trap_writer, trap_compression);
let res = write_trap(
&trap_dir,
path,
&trap_writer,
trap_compression,
path_transformer.as_ref(),
);
if let Ok(output_path) = std::env::var("CODEQL_EXTRACTOR_RUBY_OVERLAY_BASE_METADATA_OUT") {
// We're extracting an overlay base. For now, we don't have any metadata we need to store
// that would get read when extracting the overlay, but the CLI expects us to write
// *something*. An empty file will do.
std::fs::write(output_path, b"")?;
}
tracing::info!("Extraction complete");
res
}
@@ -237,8 +264,14 @@ fn write_trap(
path: PathBuf,
trap_writer: &trap::Writer,
trap_compression: trap::Compression,
path_transformer: Option<&PathTransformer>,
) -> std::io::Result<()> {
let trap_file = file_paths::path_for(trap_dir, &path, trap_compression.extension());
let trap_file = file_paths::path_for(
trap_dir,
&path,
trap_compression.extension(),
path_transformer,
);
std::fs::create_dir_all(trap_file.parent().unwrap())?;
trap_writer.write_to_file(&trap_file, trap_compression)
}
@@ -302,6 +335,39 @@ fn skip_space(content: &[u8], index: usize) -> usize {
}
index
}
/**
 * If the relevant environment variable has been set by the CLI, indicating that we are extracting
 * an overlay, this function reads the JSON file at the path given by its value, and returns a set
 * of canonicalized paths of source files that have changed and should therefore be extracted.
 *
 * If the environment variable is not set (i.e. we're not extracting an overlay), or if the file
 * cannot be read, this function returns `None`. In that case, all files should be extracted.
 */
fn get_overlay_changed_files() -> Option<HashSet<PathBuf>> {
    let changes_path = std::env::var("CODEQL_EXTRACTOR_RUBY_OVERLAY_CHANGES").ok()?;
    let contents = fs::read_to_string(changes_path).ok()?;
    let parsed: serde_json::Value = serde_json::from_str(&contents).ok()?;
    // The JSON file is expected to have the following structure:
    // {
    //   "changes": [
    //     "relative/path/to/changed/file1.rb",
    //     "relative/path/to/changed/file2.rb",
    //     ...
    //   ]
    // }
    let changes = parsed.get("changes")?.as_array()?;
    let mut changed_files = HashSet::new();
    for entry in changes {
        // Non-string entries and paths that fail to canonicalize are silently skipped,
        // matching the best-effort contract described above.
        if let Some(raw) = entry.as_str() {
            if let Ok(canonical) = PathBuf::from(raw).canonicalize() {
                changed_files.insert(canonical);
            }
        }
    }
    Some(changed_files)
}
fn scan_coding_comment(content: &[u8]) -> std::option::Option<Cow<str>> {
let mut index = 0;
// skip UTF-8 BOM marker if there is one

View File

@@ -28,5 +28,5 @@ pub fn run(options: Options) -> std::io::Result<()> {
},
];
generate(languages, options.dbscheme, options.library)
generate(languages, options.dbscheme, options.library, true)
}

View File

@@ -108,6 +108,12 @@ yaml_locations(unique int locatable: @yaml_locatable ref,
@yaml_locatable = @yaml_node | @yaml_error;
/*- Database metadata -*/
databaseMetadata(
string metadataKey: string ref,
string value: string ref
);
/*- Ruby dbscheme -*/
@ruby_underscore_arg = @ruby_assignment | @ruby_binary | @ruby_conditional | @ruby_operator_assignment | @ruby_range | @ruby_unary | @ruby_underscore_primary

View File

@@ -21521,6 +21521,42 @@
</dep>
</dependencies>
</relation>
<relation>
<name>databaseMetadata</name>
<cardinality>1</cardinality>
<columnsizes>
<e>
<k>metadataKey</k>
<v>1</v>
</e>
<e>
<k>value</k>
<v>1</v>
</e>
</columnsizes>
<dependencies>
<dep>
<src>metadataKey</src>
<trg>value</trg>
<val>
<hist>
<budget>12</budget>
<bs/>
</hist>
</val>
</dep>
<dep>
<src>value</src>
<trg>metadataKey</trg>
<val>
<hist>
<budget>12</budget>
<bs/>
</hist>
</val>
</dep>
</dependencies>
</relation>
<relation>
<name>yaml_aliases</name>
<cardinality>0</cardinality>

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,2 @@
description: Add databaseMetadata relation
compatibility: full

View File

@@ -15,7 +15,7 @@ impl Archiver {
}
fn try_archive(&self, source: &Path) -> std::io::Result<()> {
let dest = file_paths::path_for(&self.root, source, "");
let dest = file_paths::path_for(&self.root, source, "", None);
if fs::metadata(&dest).is_ok() {
return Ok(());
}

View File

@@ -212,7 +212,7 @@ impl TrapFile {
);
}
pub fn emit_file(&mut self, absolute_path: &Path) -> Label<generated::File> {
let untyped = extractor::populate_file(&mut self.writer, absolute_path);
let untyped = extractor::populate_file(&mut self.writer, absolute_path, None);
// SAFETY: populate_file emits `@file` typed labels
unsafe { Label::from_untyped(untyped) }
}
@@ -268,6 +268,7 @@ impl TrapFileProvider {
&self.trap_dir.join(category),
key.as_ref(),
self.compression.extension(),
None,
);
debug!("creating trap file {}", path.display());
let mut writer = trap::Writer::new();

View File

@@ -67,19 +67,26 @@ pub fn default_subscriber_with_level(
),
)
}
pub fn populate_file(writer: &mut trap::Writer, absolute_path: &Path) -> trap::Label {
pub fn populate_file(
writer: &mut trap::Writer,
absolute_path: &Path,
transformer: Option<&file_paths::PathTransformer>,
) -> trap::Label {
let (file_label, fresh) = writer.global_id(&trap::full_id_for_file(
&file_paths::normalize_path(absolute_path),
&file_paths::normalize_and_transform_path(absolute_path, transformer),
));
if fresh {
writer.add_tuple(
"files",
vec![
trap::Arg::Label(file_label),
trap::Arg::String(file_paths::normalize_path(absolute_path)),
trap::Arg::String(file_paths::normalize_and_transform_path(
absolute_path,
transformer,
)),
],
);
populate_parent_folders(writer, file_label, absolute_path.parent());
populate_parent_folders(writer, file_label, absolute_path.parent(), transformer);
}
file_label
}
@@ -117,6 +124,7 @@ pub fn populate_parent_folders(
writer: &mut trap::Writer,
child_label: trap::Label,
path: Option<&Path>,
transformer: Option<&file_paths::PathTransformer>,
) {
let mut path = path;
let mut child_label = child_label;
@@ -124,9 +132,9 @@ pub fn populate_parent_folders(
match path {
None => break,
Some(folder) => {
let (folder_label, fresh) = writer.global_id(&trap::full_id_for_folder(
&file_paths::normalize_path(folder),
));
let parent = folder.parent();
let folder = file_paths::normalize_and_transform_path(folder, transformer);
let (folder_label, fresh) = writer.global_id(&trap::full_id_for_folder(&folder));
writer.add_tuple(
"containerparent",
vec![
@@ -137,12 +145,9 @@ pub fn populate_parent_folders(
if fresh {
writer.add_tuple(
"folders",
vec![
trap::Arg::Label(folder_label),
trap::Arg::String(file_paths::normalize_path(folder)),
],
vec![trap::Arg::Label(folder_label), trap::Arg::String(folder)],
);
path = folder.parent();
path = parent;
child_label = folder_label;
} else {
break;
@@ -205,11 +210,12 @@ pub fn extract(
schema: &NodeTypeMap,
diagnostics_writer: &mut diagnostics::LogWriter,
trap_writer: &mut trap::Writer,
transformer: Option<&file_paths::PathTransformer>,
path: &Path,
source: &[u8],
ranges: &[Range],
) {
let path_str = file_paths::normalize_path(path);
let path_str = file_paths::normalize_and_transform_path(path, transformer);
let span = tracing::span!(
tracing::Level::TRACE,
"extract",
@@ -225,7 +231,7 @@ pub fn extract(
parser.set_included_ranges(ranges).unwrap();
let tree = parser.parse(source, None).expect("Failed to parse file");
trap_writer.comment(format!("Auto-generated TRAP file for {}", path_str));
let file_label = populate_file(trap_writer, path);
let file_label = populate_file(trap_writer, path, transformer);
let mut visitor = Visitor::new(
source,
diagnostics_writer,

View File

@@ -1,4 +1,4 @@
use crate::trap;
use crate::{file_paths, trap};
use globset::{GlobBuilder, GlobSetBuilder};
use rayon::prelude::*;
use std::fs::File;
@@ -111,6 +111,8 @@ impl Extractor {
)
};
let path_transformer = file_paths::load_path_transformer()?;
let lines: std::io::Result<Vec<String>> = file_lists
.iter()
.flat_map(|file_list| std::io::BufReader::new(file_list).lines())
@@ -122,8 +124,12 @@ impl Extractor {
.try_for_each(|line| {
let mut diagnostics_writer = diagnostics.logger();
let path = PathBuf::from(line).canonicalize()?;
let src_archive_file =
crate::file_paths::path_for(&self.source_archive_dir, &path, "");
let src_archive_file = crate::file_paths::path_for(
&self.source_archive_dir,
&path,
"",
path_transformer.as_ref(),
);
let source = std::fs::read(&path)?;
let mut trap_writer = trap::Writer::new();
@@ -152,6 +158,7 @@ impl Extractor {
&schemas[i],
&mut diagnostics_writer,
&mut trap_writer,
None,
&path,
&source,
&[],
@@ -183,7 +190,7 @@ fn write_trap(
trap_writer: &trap::Writer,
trap_compression: trap::Compression,
) -> std::io::Result<()> {
let trap_file = crate::file_paths::path_for(trap_dir, path, trap_compression.extension());
let trap_file = crate::file_paths::path_for(trap_dir, path, trap_compression.extension(), None);
std::fs::create_dir_all(trap_file.parent().unwrap())?;
trap_writer.write_to_file(&trap_file, trap_compression)
}

View File

@@ -1,8 +1,81 @@
use std::path::{Path, PathBuf};
use std::{
fs,
path::{Path, PathBuf},
};
/// Normalizes the path according the common CodeQL specification. Assumes that
/// `path` has already been canonicalized using `std::fs::canonicalize`.
pub fn normalize_path(path: &Path) -> String {
/// This represents the minimum supported path transformation that is needed to support extracting
/// overlay databases. Specifically, it represents a transformer where one path prefix is replaced
/// with a different prefix.
pub struct PathTransformer {
    /// The path prefix to be matched (and stripped) at the start of a normalized path.
    pub original: String,
    /// The prefix substituted for `original` when producing the transformed path.
    pub replacement: String,
}
/// Normalizes the path according to the common CodeQL specification and applies the given path
/// transformer, if any. Assumes that `path` has already been canonicalized using
/// `std::fs::canonicalize`.
pub fn normalize_and_transform_path(path: &Path, transformer: Option<&PathTransformer>) -> String {
    let normalized = normalize_path(path);
    if let Some(t) = transformer {
        // Only rewrite paths that actually start with the configured prefix;
        // all other paths pass through unchanged.
        if let Some(suffix) = normalized.strip_prefix(&t.original) {
            return format!("{}{}", t.replacement, suffix);
        }
    }
    normalized
}
/**
* Attempts to load a path transformer.
*
* If the `CODEQL_PATH_TRANSFORMER` environment variable is not set, no transformer has been
* specified and the function returns `Ok(None)`.
*
* If the environment variable is set, the function attempts to load the transformer from the file
* at the specified path. If this is successful, it returns `Ok(Some(PathTransformer))`.
*
* If the file cannot be read, or if it does not match the minimal subset of the path-transformer
* syntax supported by this extractor, the function returns an error.
*/
pub fn load_path_transformer() -> std::io::Result<Option<PathTransformer>> {
let path = match std::env::var("CODEQL_PATH_TRANSFORMER") {
Ok(p) => p,
Err(_) => return Ok(None),
};
let file_content = fs::read_to_string(path)?;
let lines = file_content
.lines()
.map(|line| line.trim().to_owned())
.filter(|line| !line.is_empty())
.collect::<Vec<String>>();
if lines.len() != 2 {
return Err(unsupported_transformer_error());
}
let replacement = lines[0]
.strip_prefix('#')
.ok_or(unsupported_transformer_error())?;
let original = lines[1]
.strip_suffix("//")
.ok_or(unsupported_transformer_error())?;
Ok(Some(PathTransformer {
original: original.to_owned(),
replacement: replacement.to_owned(),
}))
}
/// Builds the `InvalidData` error reported when a path-transformer file does not match the
/// minimal syntax subset this extractor supports (a single prefix rewrite: one `#`-prefixed
/// replacement line followed by one `//`-suffixed original-prefix line).
fn unsupported_transformer_error() -> std::io::Error {
    const MESSAGE: &str =
        "This extractor only supports path transformers specifying a single path-prefix rewrite, \
        with the first line starting with a # and the second line ending with //.";
    std::io::Error::new(std::io::ErrorKind::InvalidData, MESSAGE)
}
/// Normalizes the path according to the common CodeQL specification. Assumes that `path` has
/// already been canonicalized using `std::fs::canonicalize`.
fn normalize_path(path: &Path) -> String {
if cfg!(windows) {
// The way Rust canonicalizes paths doesn't match the CodeQL spec, so we
// have to do a bit of work removing certain prefixes and replacing
@@ -93,7 +166,18 @@ pub fn path_from_string(path: &str) -> PathBuf {
result
}
pub fn path_for(dir: &Path, path: &Path, ext: &str) -> PathBuf {
pub fn path_for(
dir: &Path,
path: &Path,
ext: &str,
transformer: Option<&PathTransformer>,
) -> PathBuf {
let path = if transformer.is_some() {
let transformed = normalize_and_transform_path(path, transformer);
PathBuf::from(transformed)
} else {
path.to_path_buf()
};
let mut result = PathBuf::from(dir);
for component in path.components() {
match component {

View File

@@ -17,6 +17,7 @@ pub fn generate(
languages: Vec<language::Language>,
dbscheme_path: PathBuf,
ql_library_path: PathBuf,
add_metadata_relation: bool,
) -> std::io::Result<()> {
let dbscheme_file = File::create(dbscheme_path).map_err(|e| {
tracing::error!("Failed to create dbscheme file: {}", e);
@@ -32,6 +33,16 @@ pub fn generate(
writeln!(dbscheme_writer, include_str!("prefix.dbscheme"))?;
// Eventually all languages will have the metadata relation (for overlay support), at which
// point this could be moved to prefix.dbscheme.
if add_metadata_relation {
writeln!(dbscheme_writer, "/*- Database metadata -*/",)?;
dbscheme::write(
&mut dbscheme_writer,
&[dbscheme::Entry::Table(create_database_metadata())],
)?;
}
let mut ql_writer = LineWriter::new(File::create(ql_library_path)?);
writeln!(
ql_writer,
@@ -442,3 +453,26 @@ fn create_token_case<'a>(name: &'a str, token_kinds: Map<&'a str, usize>) -> dbs
branches,
}
}
/// Builds the `databaseMetadata(metadataKey, value)` dbscheme table: a two-column,
/// string-to-string relation used to store database-level metadata (needed for overlay support).
fn create_database_metadata() -> dbscheme::Table<'static> {
    // Both columns share the same shape; build them with a small local constructor.
    let string_column = |name: &'static str| dbscheme::Column {
        db_type: dbscheme::DbColumnType::String,
        name,
        unique: false,
        ql_type: ql::Type::String,
        ql_type_is_ref: true,
    };
    dbscheme::Table {
        name: "databaseMetadata",
        keysets: None,
        columns: vec![string_column("metadataKey"), string_column("value")],
    }
}