Merge pull request #12546 from hmac/extractor-shared-library

Introduce a shared extractor library
Arthur Baars
2023-03-27 11:32:33 +02:00
committed by GitHub
41 changed files with 152 additions and 2509 deletions
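The bulk of the deletions below comes from moving the generic tree-sitter extractor machinery (diagnostics, extractor, file_paths, generator, node_types, trap) out of the Ruby extractor into the shared codeql-extractor crate, which the Ruby crate now pulls in as a path dependency. For the Ruby sources the change amounts to an import swap along these lines (a sketch based on the hunks below):

// Before: the modules lived in the Ruby crate itself.
use ruby_extractor::{diagnostics, extractor, file_paths, node_types, trap};

// After: the same modules come from the shared crate, declared in Cargo.toml as
// codeql-extractor = { path = "../../shared/tree-sitter-extractor" }
use codeql_extractor::{diagnostics, extractor, file_paths, node_types, trap};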

View File

@@ -24,6 +24,6 @@ runs:
if: steps.cache-extractor.outputs.cache-hit != 'true'
shell: bash
run: |
cargo install cross --version 0.2.1
cargo install cross --version 0.2.5
scripts/create-extractor-pack.sh
working-directory: ruby

Binary file not shown.

View File

@@ -7,7 +7,6 @@ edition = "2018"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
flate2 = "1.0"
tree-sitter = "0.20"
tree-sitter-embedded-template = { git = "https://github.com/tree-sitter/tree-sitter-embedded-template.git", rev = "203f7bd3c1bbfbd98fc19add4b8fcb213c059205" }
tree-sitter-ruby = { git = "https://github.com/tree-sitter/tree-sitter-ruby.git", rev = "206c7077164372c596ffa8eaadb9435c28941364" }
@@ -15,10 +14,7 @@ clap = "3.0"
tracing = "0.1"
tracing-subscriber = { version = "0.3.3", features = ["env-filter"] }
rayon = "1.5.0"
num_cpus = "1.14.0"
regex = "1.7.1"
encoding = "0.2"
lazy_static = "1.4.0"
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
chrono = { version = "0.4.19", features = ["serde"] }
codeql-extractor = { path = "../../shared/tree-sitter-extractor" }

View File

@@ -1,2 +1,8 @@
[target.x86_64-unknown-linux-gnu]
image = "centos/devtoolset-7-toolchain-centos7"
[build.env]
# Provide the path to the shared extractor
# Cross mounts this directory as a volume, so builds inside the docker container
# can see it.
volumes = ["__CODEQL-EXTRACTOR=../../shared/tree-sitter-extractor"]

View File

@@ -1,9 +1,8 @@
#[macro_use]
extern crate lazy_static;
extern crate num_cpus;
use clap::arg;
use encoding::{self};
use encoding;
use rayon::prelude::*;
use std::borrow::Cow;
use std::fs;
@@ -11,33 +10,7 @@ use std::io::BufRead;
use std::path::{Path, PathBuf};
use tree_sitter::{Language, Parser, Range};
use ruby_extractor::{diagnostics, extractor, file_paths, node_types, trap};
/**
* Gets the number of threads the extractor should use, by reading the
* CODEQL_THREADS environment variable and using it as described in the
* extractor spec:
*
* "If the number is positive, it indicates the number of threads that should
* be used. If the number is negative or zero, it should be added to the number
* of cores available on the machine to determine how many threads to use
* (minimum of 1). If unspecified, should be considered as set to -1."
*/
fn num_codeql_threads() -> Result<usize, String> {
let threads_str = std::env::var("CODEQL_THREADS").unwrap_or_else(|_| "-1".to_owned());
match threads_str.parse::<i32>() {
Ok(num) if num <= 0 => {
let reduction = -num as usize;
Ok(std::cmp::max(1, num_cpus::get() - reduction))
}
Ok(num) => Ok(num as usize),
Err(_) => Err(format!(
"Unable to parse CODEQL_THREADS value '{}'",
&threads_str
)),
}
}
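For concreteness, the spec quoted above works out as follows (a hedged illustration assuming a hypothetical 8-core machine):

// CODEQL_THREADS unset -> treated as "-1" -> max(1, 8 - 1) = 7 threads
// CODEQL_THREADS=0     -> max(1, 8 - 0) = 8 threads
// CODEQL_THREADS=-7    -> max(1, 8 - 7) = 1 thread
// CODEQL_THREADS=4     -> 4 threads
// CODEQL_THREADS=abc   -> Err("Unable to parse CODEQL_THREADS value 'abc'")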
use codeql_extractor::{diagnostics, extractor, file_paths, node_types, trap};
lazy_static! {
static ref CP_NUMBER: regex::Regex = regex::Regex::new("cp([0-9]+)").unwrap();
@@ -67,7 +40,7 @@ fn main() -> std::io::Result<()> {
.init();
let diagnostics = diagnostics::DiagnosticLoggers::new("ruby");
let mut main_thread_logger = diagnostics.logger();
let num_threads = match num_codeql_threads() {
let num_threads = match codeql_extractor::options::num_threads() {
Ok(num) => num,
Err(e) => {
main_thread_logger.write(
@@ -307,8 +280,10 @@ fn scan_erb(
}
}
}
if result.is_empty() {
let root = tree.root_node();
// Add an empty range at the end of the file
result.push(Range {
start_byte: root.end_byte(),

View File

@@ -6,8 +6,8 @@ use std::io::LineWriter;
use std::io::Write;
use std::path::PathBuf;
use ruby_extractor::generator::{dbscheme, language::Language, ql, ql_gen};
use ruby_extractor::node_types;
use codeql_extractor::generator::{dbscheme, language::Language, ql, ql_gen};
use codeql_extractor::node_types;
/// Given the name of the parent node, and its field information, returns a pair,
/// the first of which is the field's type. The second is an optional dbscheme

View File

@@ -1,364 +0,0 @@
use serde::Serialize;
use std::io::Write;
use std::path::PathBuf;
/** SARIF severity */
#[derive(Serialize)]
#[serde(rename_all = "lowercase")]
pub enum Severity {
Error,
Warning,
#[allow(unused)]
Note,
}
#[derive(Serialize)]
#[serde(rename_all = "camelCase")]
pub struct Source {
/** An identifier under which it makes sense to group this diagnostic message. This is used to build the SARIF reporting descriptor object.*/
pub id: String,
/** Display name for the ID. This is used to build the SARIF reporting descriptor object. */
pub name: String,
#[serde(skip_serializing_if = "Option::is_none")]
/** Name of the CodeQL extractor. This is used to identify which tool component the reporting descriptor object should be nested under in SARIF.*/
pub extractor_name: Option<String>,
}
#[derive(Serialize)]
#[serde(rename_all = "camelCase")]
pub struct Visibility {
#[serde(skip_serializing_if = "std::ops::Not::not")]
/** True if the message should be displayed on the status page (defaults to false) */
pub status_page: bool,
#[serde(skip_serializing_if = "std::ops::Not::not")]
/** True if the message should be counted in the diagnostics summary table printed by `codeql database analyze` (defaults to false) */
pub cli_summary_table: bool,
#[serde(skip_serializing_if = "std::ops::Not::not")]
/** True if the message should be sent to telemetry (defaults to false) */
pub telemetry: bool,
}
#[derive(Serialize, Clone, Default)]
#[serde(rename_all = "camelCase")]
pub struct Location {
#[serde(skip_serializing_if = "Option::is_none")]
/** Path to the affected file if appropriate, relative to the source root */
pub file: Option<String>,
#[serde(skip_serializing_if = "Option::is_none")]
pub start_line: Option<usize>,
#[serde(skip_serializing_if = "Option::is_none")]
pub start_column: Option<usize>,
#[serde(skip_serializing_if = "Option::is_none")]
pub end_line: Option<usize>,
#[serde(skip_serializing_if = "Option::is_none")]
pub end_column: Option<usize>,
}
#[derive(Serialize)]
#[serde(rename_all = "camelCase")]
pub struct DiagnosticMessage {
/** Timestamp (UTC) at which the message was created */
pub timestamp: chrono::DateTime<chrono::Utc>,
pub source: Source,
#[serde(skip_serializing_if = "String::is_empty")]
/** GitHub flavored Markdown formatted message. Should include inline links to any help pages. */
pub markdown_message: String,
#[serde(skip_serializing_if = "String::is_empty")]
/** Plain text message. Used by components where the string processing needed to support Markdown is cumbersome. */
pub plaintext_message: String,
#[serde(skip_serializing_if = "Vec::is_empty")]
/** List of help links intended to supplement the `plaintextMessage`. */
pub help_links: Vec<String>,
#[serde(skip_serializing_if = "Option::is_none")]
pub severity: Option<Severity>,
#[serde(skip_serializing_if = "std::ops::Not::not")]
pub internal: bool,
#[serde(skip_serializing_if = "is_default_visibility")]
pub visibility: Visibility,
#[serde(skip_serializing_if = "Option::is_none")]
pub location: Option<Location>,
}
fn is_default_visibility(v: &Visibility) -> bool {
!v.cli_summary_table && !v.status_page && !v.telemetry
}
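Given the serde attributes above, a message written by LogWriter::write below ends up as one JSON line per entry, roughly like this (values are illustrative; empty strings, None fields, and all-false visibility are omitted):

// {"timestamp":"2023-03-27T09:32:33Z",
//  "source":{"id":"ruby/parse-error","name":"Could not process some files due to syntax errors","extractorName":"ruby"},
//  "plaintextMessage":"A parse error occurred.",
//  "markdownMessage":"A parse error occurred.",
//  "severity":"warning",
//  "visibility":{"statusPage":true},
//  "location":{"file":"app/models/user.rb","startLine":3,"startColumn":1,"endLine":3,"endColumn":7}}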
pub struct LogWriter {
extractor: String,
path: Option<PathBuf>,
inner: Option<std::io::BufWriter<std::fs::File>>,
}
impl LogWriter {
pub fn new_entry(&self, id: &str, name: &str) -> DiagnosticMessage {
DiagnosticMessage {
timestamp: chrono::Utc::now(),
source: Source {
id: format!("{}/{}", self.extractor, id),
name: name.to_owned(),
extractor_name: Some(self.extractor.to_owned()),
},
markdown_message: String::new(),
plaintext_message: String::new(),
help_links: vec![],
severity: None,
internal: false,
visibility: Visibility {
cli_summary_table: false,
status_page: false,
telemetry: false,
},
location: None,
}
}
pub fn write(&mut self, mesg: &DiagnosticMessage) {
let full_error_message = mesg.full_error_message();
match mesg.severity {
Some(Severity::Error) => tracing::error!("{}", full_error_message),
Some(Severity::Warning) => tracing::warn!("{}", full_error_message),
Some(Severity::Note) => tracing::info!("{}", full_error_message),
None => tracing::debug!("{}", full_error_message),
}
if self.inner.is_none() {
if let Some(path) = self.path.as_ref() {
match std::fs::OpenOptions::new()
.create(true)
.append(true)
.write(true)
.open(&path)
{
Err(e) => {
tracing::error!(
"Could not open log file '{}': {}",
&path.to_string_lossy(),
e
);
self.path = None;
self.inner = None
}
Ok(file) => self.inner = Some(std::io::BufWriter::new(file)),
}
}
}
if let Some(mut writer) = self.inner.as_mut() {
serde_json::to_writer(&mut writer, mesg)
.unwrap_or_else(|e| tracing::debug!("Failed to write log entry: {}", e));
writer
.write_all(b"\n")
.unwrap_or_else(|e| tracing::debug!("Failed to write log entry: {}", e));
}
}
}
pub struct DiagnosticLoggers {
extractor: String,
root: Option<PathBuf>,
}
impl DiagnosticLoggers {
pub fn new(extractor: &str) -> Self {
let env_var = format!(
"CODEQL_EXTRACTOR_{}_DIAGNOSTIC_DIR",
extractor.to_ascii_uppercase()
);
let root = match std::env::var(&env_var) {
Err(e) => {
tracing::error!("{}: {}", e, &env_var);
None
}
Ok(dir) => {
if let Err(e) = std::fs::create_dir_all(&dir) {
tracing::error!("Failed to create log directory {}: {}", &dir, e);
None
} else {
Some(PathBuf::from(dir))
}
}
};
DiagnosticLoggers {
extractor: extractor.to_owned(),
root,
}
}
pub fn logger(&self) -> LogWriter {
thread_local! {
static THREAD_NUM: usize = {
static COUNT: std::sync::atomic::AtomicUsize = std::sync::atomic::AtomicUsize::new(0);
COUNT.fetch_add(1, std::sync::atomic::Ordering::SeqCst)
};
}
THREAD_NUM.with(|n| LogWriter {
extractor: self.extractor.to_owned(),
inner: None,
path: self
.root
.as_ref()
.map(|root| root.to_owned().join(format!("extractor_{}.jsonl", n))),
})
}
}
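A minimal usage sketch of this API (the extractor name, file path, and message are hypothetical; the call shape mirrors the Ruby extractor code further down):

let diagnostics = DiagnosticLoggers::new("ruby");
let mut logger = diagnostics.logger();
let mut mesg = logger.new_entry("parse-error", "Could not process some files due to syntax errors");
mesg.severity(Severity::Warning)
    .file("app/models/user.rb")
    .message("A parse error occurred near {}.", &[MessageArg::Code("def")]);
logger.write(&mesg); // appends one JSON line to extractor_<n>.jsonl in the diagnostics directory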
/// Length of the longest run of consecutive backticks in `text`; used to pick
/// a code fence long enough to wrap the text in Markdown.
fn longest_backtick_sequence_length(text: &str) -> usize {
let mut result = 0;
let mut count = 0;
for c in text.chars() {
if c == '`' {
count += 1;
if count > result {
result = count;
}
} else {
count = 0;
}
}
result
}
/// An argument of a diagnostic message format string.
/// A message argument is either a "code" snippet or a link.
pub enum MessageArg<'a> {
Code(&'a str),
Link(&'a str, &'a str),
}
impl DiagnosticMessage {
pub fn full_error_message(&self) -> String {
match &self.location {
Some(Location {
file: Some(path),
start_line: None,
..
}) => format!("{}: {}", path, self.plaintext_message),
Some(Location {
file: Some(path),
start_line: Some(line),
..
}) => format!("{}:{}: {}", path, line, self.plaintext_message),
_ => self.plaintext_message.to_owned(),
}
}
fn text(&mut self, text: &str) -> &mut Self {
self.plaintext_message = text.to_owned();
self
}
pub fn message(&mut self, text: &str, args: &[MessageArg]) -> &mut Self {
let parts = text.split("{}");
let mut plain = String::with_capacity(2 * text.len());
let mut markdown = String::with_capacity(2 * text.len());
for (i, p) in parts.enumerate() {
plain.push_str(p);
markdown.push_str(p);
match args.get(i) {
Some(MessageArg::Code(t)) => {
plain.push_str(t);
if t.len() > 0 {
let count = longest_backtick_sequence_length(t) + 1;
markdown.push_str(&"`".repeat(count));
if count > 1 {
markdown.push_str(" ");
}
markdown.push_str(t);
if count > 1 {
markdown.push_str(" ");
}
markdown.push_str(&"`".repeat(count));
}
}
Some(MessageArg::Link(text, url)) => {
plain.push_str(text);
self.help_link(url);
markdown.push_str("[");
markdown.push_str(text);
markdown.push_str("](");
markdown.push_str(url);
markdown.push_str(")");
}
None => {}
}
}
self.text(&plain);
self.markdown(&markdown);
self
}
pub fn markdown(&mut self, text: &str) -> &mut Self {
self.markdown_message = text.to_owned();
self
}
pub fn severity(&mut self, severity: Severity) -> &mut Self {
self.severity = Some(severity);
self
}
#[allow(unused)]
pub fn help_link(&mut self, link: &str) -> &mut Self {
self.help_links.push(link.to_owned());
self
}
#[allow(unused)]
pub fn internal(&mut self) -> &mut Self {
self.internal = true;
self
}
#[allow(unused)]
pub fn cli_summary_table(&mut self) -> &mut Self {
self.visibility.cli_summary_table = true;
self
}
pub fn status_page(&mut self) -> &mut Self {
self.visibility.status_page = true;
self
}
#[allow(unused)]
pub fn telemetry(&mut self) -> &mut Self {
self.visibility.telemetry = true;
self
}
pub fn file(&mut self, path: &str) -> &mut Self {
let loc = self.location.get_or_insert(Default::default());
loc.file = Some(path.to_owned());
self
}
pub fn location(
&mut self,
path: &str,
start_line: usize,
start_column: usize,
end_line: usize,
end_column: usize,
) -> &mut Self {
let loc = self.location.get_or_insert(Default::default());
loc.file = Some(path.to_owned());
loc.start_line = Some(start_line);
loc.start_column = Some(start_column);
loc.end_line = Some(end_line);
loc.end_column = Some(end_column);
self
}
}
#[test]
fn test_message() {
let mut m = DiagnosticLoggers::new("foo")
.logger()
.new_entry("id", "name");
m.message("hello: {}", &[MessageArg::Code("hello")]);
assert_eq!("hello: hello", m.plaintext_message);
assert_eq!("hello: `hello`", m.markdown_message);
let mut m = DiagnosticLoggers::new("foo")
.logger()
.new_entry("id", "name");
m.message(
"hello with backticks: {}",
&[MessageArg::Code("oh `hello`!")],
);
assert_eq!("hello with backticks: oh `hello`!", m.plaintext_message);
assert_eq!(
"hello with backticks: `` oh `hello`! ``",
m.markdown_message
);
}

View File

@@ -1,647 +0,0 @@
use crate::diagnostics;
use crate::file_paths;
use crate::node_types::{self, EntryKind, Field, NodeTypeMap, Storage, TypeName};
use crate::trap;
use std::collections::BTreeMap as Map;
use std::collections::BTreeSet as Set;
use std::fmt;
use std::path::Path;
use tree_sitter::{Language, Node, Parser, Range, Tree};
pub fn populate_file(writer: &mut trap::Writer, absolute_path: &Path) -> trap::Label {
let (file_label, fresh) = writer.global_id(&trap::full_id_for_file(
&file_paths::normalize_path(absolute_path),
));
if fresh {
writer.add_tuple(
"files",
vec![
trap::Arg::Label(file_label),
trap::Arg::String(file_paths::normalize_path(absolute_path)),
],
);
populate_parent_folders(writer, file_label, absolute_path.parent());
}
file_label
}
fn populate_empty_file(writer: &mut trap::Writer) -> trap::Label {
let (file_label, fresh) = writer.global_id("empty;sourcefile");
if fresh {
writer.add_tuple(
"files",
vec![
trap::Arg::Label(file_label),
trap::Arg::String("".to_string()),
],
);
}
file_label
}
pub fn populate_empty_location(writer: &mut trap::Writer) {
let file_label = populate_empty_file(writer);
location(writer, file_label, 0, 0, 0, 0);
}
pub fn populate_parent_folders(
writer: &mut trap::Writer,
child_label: trap::Label,
path: Option<&Path>,
) {
let mut path = path;
let mut child_label = child_label;
loop {
match path {
None => break,
Some(folder) => {
let (folder_label, fresh) = writer.global_id(&trap::full_id_for_folder(
&file_paths::normalize_path(folder),
));
writer.add_tuple(
"containerparent",
vec![
trap::Arg::Label(folder_label),
trap::Arg::Label(child_label),
],
);
if fresh {
writer.add_tuple(
"folders",
vec![
trap::Arg::Label(folder_label),
trap::Arg::String(file_paths::normalize_path(folder)),
],
);
path = folder.parent();
child_label = folder_label;
} else {
break;
}
}
}
}
}
fn location(
writer: &mut trap::Writer,
file_label: trap::Label,
start_line: usize,
start_column: usize,
end_line: usize,
end_column: usize,
) -> trap::Label {
let (loc_label, fresh) = writer.global_id(&format!(
"loc,{{{}}},{},{},{},{}",
file_label, start_line, start_column, end_line, end_column
));
if fresh {
writer.add_tuple(
"locations_default",
vec![
trap::Arg::Label(loc_label),
trap::Arg::Label(file_label),
trap::Arg::Int(start_line),
trap::Arg::Int(start_column),
trap::Arg::Int(end_line),
trap::Arg::Int(end_column),
],
);
}
loc_label
}
/// Extracts the source file at `path`, which is assumed to be canonicalized.
pub fn extract(
language: Language,
language_prefix: &str,
schema: &NodeTypeMap,
diagnostics_writer: &mut diagnostics::LogWriter,
trap_writer: &mut trap::Writer,
path: &Path,
source: &[u8],
ranges: &[Range],
) {
let path_str = file_paths::normalize_path(&path);
let span = tracing::span!(
tracing::Level::TRACE,
"extract",
file = %path_str
);
let _enter = span.enter();
tracing::info!("extracting: {}", path_str);
let mut parser = Parser::new();
parser.set_language(language).unwrap();
parser.set_included_ranges(ranges).unwrap();
let tree = parser.parse(&source, None).expect("Failed to parse file");
trap_writer.comment(format!("Auto-generated TRAP file for {}", path_str));
let file_label = populate_file(trap_writer, path);
let mut visitor = Visitor::new(
source,
diagnostics_writer,
trap_writer,
// TODO: should we handle path strings that are not valid UTF8 better?
&path_str,
file_label,
language_prefix,
schema,
);
traverse(&tree, &mut visitor);
parser.reset();
}
struct ChildNode {
field_name: Option<&'static str>,
label: trap::Label,
type_name: TypeName,
}
struct Visitor<'a> {
/// The file path of the source code (as string)
path: &'a str,
/// The label to use whenever we need to refer to the `@file` entity of this
/// source file.
file_label: trap::Label,
/// The source code as a UTF-8 byte array
source: &'a [u8],
/// A diagnostics::LogWriter to write diagnostic messages
diagnostics_writer: &'a mut diagnostics::LogWriter,
/// A trap::Writer to accumulate trap entries
trap_writer: &'a mut trap::Writer,
/// A counter for top-level child nodes
toplevel_child_counter: usize,
/// Language-specific name of the AST info table
ast_node_info_table_name: String,
/// Language-specific name of the tokeninfo table
tokeninfo_table_name: String,
/// A lookup table from type name to node types
schema: &'a NodeTypeMap,
/// A stack for gathering information from child nodes. Whenever a node is
/// entered, the parent's [Label], a child counter, and an empty child list are
/// pushed. All children append their data to that list. When the visitor
/// leaves a node, the list containing the child data is popped from the stack
/// and matched against the dbscheme for the node. If the expectations are met,
/// the corresponding row definitions are added to the TRAP output.
stack: Vec<(trap::Label, usize, Vec<ChildNode>)>,
}
impl<'a> Visitor<'a> {
fn new(
source: &'a [u8],
diagnostics_writer: &'a mut diagnostics::LogWriter,
trap_writer: &'a mut trap::Writer,
path: &'a str,
file_label: trap::Label,
language_prefix: &str,
schema: &'a NodeTypeMap,
) -> Visitor<'a> {
Visitor {
path,
file_label,
source,
diagnostics_writer,
trap_writer,
toplevel_child_counter: 0,
ast_node_info_table_name: format!("{}_ast_node_info", language_prefix),
tokeninfo_table_name: format!("{}_tokeninfo", language_prefix),
schema,
stack: Vec::new(),
}
}
fn record_parse_error(&mut self, loc: trap::Label, mesg: &diagnostics::DiagnosticMessage) {
self.diagnostics_writer.write(mesg);
let id = self.trap_writer.fresh_id();
let full_error_message = mesg.full_error_message();
let severity_code = match mesg.severity {
Some(diagnostics::Severity::Error) => 40,
Some(diagnostics::Severity::Warning) => 30,
Some(diagnostics::Severity::Note) => 20,
None => 10,
};
self.trap_writer.add_tuple(
"diagnostics",
vec![
trap::Arg::Label(id),
trap::Arg::Int(severity_code),
trap::Arg::String("parse_error".to_string()),
trap::Arg::String(mesg.plaintext_message.to_owned()),
trap::Arg::String(full_error_message),
trap::Arg::Label(loc),
],
);
}
fn record_parse_error_for_node(
&mut self,
message: &str,
args: &[diagnostics::MessageArg],
node: Node,
status_page: bool,
) {
let (start_line, start_column, end_line, end_column) = location_for(self, node);
let loc = location(
self.trap_writer,
self.file_label,
start_line,
start_column,
end_line,
end_column,
);
let mut mesg = self.diagnostics_writer.new_entry(
"parse-error",
"Could not process some files due to syntax errors",
);
mesg
.severity(diagnostics::Severity::Warning)
.location(self.path, start_line, start_column, end_line, end_column)
.message(message, args);
if status_page {
mesg.status_page();
}
self.record_parse_error(loc, &mesg);
}
fn enter_node(&mut self, node: Node) -> bool {
if node.is_missing() {
self.record_parse_error_for_node(
"A parse error occurred (expected {} symbol). Check the syntax of the file. If the file is invalid, correct the error or {} the file from analysis.",
&[diagnostics::MessageArg::Code(node.kind()), diagnostics::MessageArg::Link("exclude", "https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/customizing-code-scanning")],
node,
true,
);
return false;
}
if node.is_error() {
self.record_parse_error_for_node(
"A parse error occurred. Check the syntax of the file. If the file is invalid, correct the error or {} the file from analysis.",
&[diagnostics::MessageArg::Link("exclude", "https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/customizing-code-scanning")],
node,
true,
);
return false;
};
let id = self.trap_writer.fresh_id();
self.stack.push((id, 0, Vec::new()));
true
}
fn leave_node(&mut self, field_name: Option<&'static str>, node: Node) {
if node.is_error() || node.is_missing() {
return;
}
let (id, _, child_nodes) = self.stack.pop().expect("Visitor: empty stack");
let (start_line, start_column, end_line, end_column) = location_for(self, node);
let loc = location(
self.trap_writer,
self.file_label,
start_line,
start_column,
end_line,
end_column,
);
let table = self
.schema
.get(&TypeName {
kind: node.kind().to_owned(),
named: node.is_named(),
})
.unwrap();
let mut valid = true;
let (parent_id, parent_index) = match self.stack.last_mut() {
Some(p) if !node.is_extra() => {
p.1 += 1;
(p.0, p.1 - 1)
}
_ => {
self.toplevel_child_counter += 1;
(self.file_label, self.toplevel_child_counter - 1)
}
};
match &table.kind {
EntryKind::Token { kind_id, .. } => {
self.trap_writer.add_tuple(
&self.ast_node_info_table_name,
vec![
trap::Arg::Label(id),
trap::Arg::Label(parent_id),
trap::Arg::Int(parent_index),
trap::Arg::Label(loc),
],
);
self.trap_writer.add_tuple(
&self.tokeninfo_table_name,
vec![
trap::Arg::Label(id),
trap::Arg::Int(*kind_id),
sliced_source_arg(self.source, node),
],
);
}
EntryKind::Table {
fields,
name: table_name,
} => {
if let Some(args) = self.complex_node(&node, fields, &child_nodes, id) {
self.trap_writer.add_tuple(
&self.ast_node_info_table_name,
vec![
trap::Arg::Label(id),
trap::Arg::Label(parent_id),
trap::Arg::Int(parent_index),
trap::Arg::Label(loc),
],
);
let mut all_args = vec![trap::Arg::Label(id)];
all_args.extend(args);
self.trap_writer.add_tuple(table_name, all_args);
}
}
_ => {
self.record_parse_error(
loc,
self.diagnostics_writer
.new_entry(
"parse-error",
"Could not process some files due to syntax errors",
)
.severity(diagnostics::Severity::Warning)
.location(self.path, start_line, start_column, end_line, end_column)
.message(
"Unknown table type: {}",
&[diagnostics::MessageArg::Code(node.kind())],
),
);
valid = false;
}
}
if valid && !node.is_extra() {
// Extra nodes are independent root nodes and do not belong to the parent node
// Therefore we should not register them in the parent vector
if let Some(parent) = self.stack.last_mut() {
parent.2.push(ChildNode {
field_name,
label: id,
type_name: TypeName {
kind: node.kind().to_owned(),
named: node.is_named(),
},
});
};
}
}
fn complex_node(
&mut self,
node: &Node,
fields: &[Field],
child_nodes: &[ChildNode],
parent_id: trap::Label,
) -> Option<Vec<trap::Arg>> {
let mut map: Map<&Option<String>, (&Field, Vec<trap::Arg>)> = Map::new();
for field in fields {
map.insert(&field.name, (field, Vec::new()));
}
for child_node in child_nodes {
if let Some((field, values)) = map.get_mut(&child_node.field_name.map(|x| x.to_owned()))
{
//TODO: handle error and missing nodes
if self.type_matches(&child_node.type_name, &field.type_info) {
if let node_types::FieldTypeInfo::ReservedWordInt(int_mapping) =
&field.type_info
{
// We can safely unwrap because type_matches checks the key is in the map.
let (int_value, _) = int_mapping.get(&child_node.type_name.kind).unwrap();
values.push(trap::Arg::Int(*int_value));
} else {
values.push(trap::Arg::Label(child_node.label));
}
} else if field.name.is_some() {
self.record_parse_error_for_node(
"Type mismatch for field {}::{} with type {} != {}",
&[
diagnostics::MessageArg::Code(node.kind()),
diagnostics::MessageArg::Code(child_node.field_name.unwrap_or("child")),
diagnostics::MessageArg::Code(&format!("{:?}", child_node.type_name)),
diagnostics::MessageArg::Code(&format!("{:?}", field.type_info)),
],
*node,
false,
);
}
} else if child_node.field_name.is_some() || child_node.type_name.named {
self.record_parse_error_for_node(
"Value for unknown field: {}::{} and type {}",
&[
diagnostics::MessageArg::Code(node.kind()),
diagnostics::MessageArg::Code(&child_node.field_name.unwrap_or("child")),
diagnostics::MessageArg::Code(&format!("{:?}", child_node.type_name)),
],
*node,
false,
);
}
}
let mut args = Vec::new();
let mut is_valid = true;
for field in fields {
let child_values = &map.get(&field.name).unwrap().1;
match &field.storage {
Storage::Column { name: column_name } => {
if child_values.len() == 1 {
args.push(child_values.first().unwrap().clone());
} else {
is_valid = false;
let error_message = format!(
"{} for field: {}::{}",
if child_values.is_empty() {
"Missing value"
} else {
"Too many values"
},
node.kind(),
column_name
);
self.record_parse_error_for_node(&error_message, &[], *node, false);
}
}
Storage::Table {
name: table_name,
has_index,
column_name: _,
} => {
for (index, child_value) in child_values.iter().enumerate() {
if !*has_index && index > 0 {
self.record_parse_error_for_node(
"Too many values for field: {}::{}",
&[
diagnostics::MessageArg::Code(node.kind()),
diagnostics::MessageArg::Code(table_name),
],
*node,
false,
);
break;
}
let mut args = vec![trap::Arg::Label(parent_id)];
if *has_index {
args.push(trap::Arg::Int(index))
}
args.push(child_value.clone());
self.trap_writer.add_tuple(table_name, args);
}
}
}
}
if is_valid {
Some(args)
} else {
None
}
}
fn type_matches(&self, tp: &TypeName, type_info: &node_types::FieldTypeInfo) -> bool {
match type_info {
node_types::FieldTypeInfo::Single(single_type) => {
if tp == single_type {
return true;
}
if let EntryKind::Union { members } = &self.schema.get(single_type).unwrap().kind {
if self.type_matches_set(tp, members) {
return true;
}
}
}
node_types::FieldTypeInfo::Multiple { types, .. } => {
return self.type_matches_set(tp, types);
}
node_types::FieldTypeInfo::ReservedWordInt(int_mapping) => {
return !tp.named && int_mapping.contains_key(&tp.kind)
}
}
false
}
fn type_matches_set(&self, tp: &TypeName, types: &Set<TypeName>) -> bool {
if types.contains(tp) {
return true;
}
for other in types.iter() {
if let EntryKind::Union { members } = &self.schema.get(other).unwrap().kind {
if self.type_matches_set(tp, members) {
return true;
}
}
}
false
}
}
// Emit a slice of a source file as an Arg.
fn sliced_source_arg(source: &[u8], n: Node) -> trap::Arg {
let range = n.byte_range();
trap::Arg::String(String::from_utf8_lossy(&source[range.start..range.end]).into_owned())
}
// Computes the CodeQL location (1-based line and column, inclusive start and
// end) for the given node.
fn location_for(visitor: &mut Visitor, n: Node) -> (usize, usize, usize, usize) {
// Tree-sitter row and column values are 0-based, while CodeQL starts
// counting at 1. In addition, tree-sitter's end position is exclusive,
// while CodeQL's end position is inclusive. This means all values should be
// incremented by 1, and the end position additionally needs to be shifted one
// column to the left. In most cases this amounts to incrementing every value
// except the end column; however, when the end column is 0 (the start of a
// line), the end position must instead be set to the end of the previous
// line.
let start_line = n.start_position().row + 1;
let start_col = n.start_position().column + 1;
let mut end_line = n.end_position().row + 1;
let mut end_col = n.end_position().column;
if start_line > end_line || start_line == end_line && start_col > end_col {
// the range is empty, clip it to sensible values
end_line = start_line;
end_col = start_col - 1;
} else if end_col == 0 {
let source = visitor.source;
// end_col = 0 means that we are at the start of a line
// unfortunately 0 is invalid as column number, therefore
// we should update the end location to be the end of the
// previous line
let mut index = n.end_byte();
if index > 0 && index <= source.len() {
index -= 1;
if source[index] != b'\n' {
visitor.diagnostics_writer.write(
visitor
.diagnostics_writer
.new_entry("internal-error", "Internal error")
.message("Expecting a line break symbol, but none found while correcting end column value", &[])
.severity(diagnostics::Severity::Error),
);
}
end_line -= 1;
end_col = 1;
while index > 0 && source[index - 1] != b'\n' {
index -= 1;
end_col += 1;
}
} else {
visitor.diagnostics_writer.write(
visitor
.diagnostics_writer
.new_entry("internal-error", "Internal error")
.message(
"Cannot correct end column value: end_byte index {} is not in range [1,{}].",
&[
diagnostics::MessageArg::Code(&index.to_string()),
diagnostics::MessageArg::Code(&source.len().to_string()),
],
)
.severity(diagnostics::Severity::Error),
);
}
}
(start_line, start_col, end_line, end_col)
}
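A hedged worked example of the conversion (node contents and positions are hypothetical):

// tree-sitter: a token spanning row 0, columns 0..3 (end column exclusive)
//   -> CodeQL: start line 1, start column 1, end line 1, end column 3
// tree-sitter: a node whose end position is row 3, column 0 (just after a newline)
//   -> CodeQL: the end is moved back to the last column of line 3, i.e. the
//      end of the previous line, since column 0 is not a valid CodeQL position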
fn traverse(tree: &Tree, visitor: &mut Visitor) {
let cursor = &mut tree.walk();
visitor.enter_node(cursor.node());
let mut recurse = true;
loop {
if recurse && cursor.goto_first_child() {
recurse = visitor.enter_node(cursor.node());
} else {
visitor.leave_node(cursor.field_name(), cursor.node());
if cursor.goto_next_sibling() {
recurse = visitor.enter_node(cursor.node());
} else if cursor.goto_parent() {
recurse = false;
} else {
break;
}
}
}
}
// Numeric indices.
#[derive(Debug, Copy, Clone)]
struct Index(usize);
impl fmt::Display for Index {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "{}", self.0)
}
}

View File

@@ -1,135 +0,0 @@
use std::path::{Path, PathBuf};
/// Normalizes the path according to the common CodeQL specification. Assumes
/// that `path` has already been canonicalized using `std::fs::canonicalize`.
pub fn normalize_path(path: &Path) -> String {
if cfg!(windows) {
// The way Rust canonicalizes paths doesn't match the CodeQL spec, so we
// have to do a bit of work removing certain prefixes and replacing
// backslashes.
let mut components: Vec<String> = Vec::new();
for component in path.components() {
match component {
std::path::Component::Prefix(prefix) => match prefix.kind() {
std::path::Prefix::Disk(letter) | std::path::Prefix::VerbatimDisk(letter) => {
components.push(format!("{}:", letter as char));
}
std::path::Prefix::Verbatim(x) | std::path::Prefix::DeviceNS(x) => {
components.push(x.to_string_lossy().to_string());
}
std::path::Prefix::UNC(server, share)
| std::path::Prefix::VerbatimUNC(server, share) => {
components.push(server.to_string_lossy().to_string());
components.push(share.to_string_lossy().to_string());
}
},
std::path::Component::Normal(n) => {
components.push(n.to_string_lossy().to_string());
}
std::path::Component::RootDir => {}
std::path::Component::CurDir => {}
std::path::Component::ParentDir => {}
}
}
components.join("/")
} else {
// For other operating systems, we can use the canonicalized path
// without modifications.
format!("{}", path.display())
}
}
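A hedged illustration of the normalization (paths are hypothetical):

// Windows, canonicalized drive path:  \\?\C:\Users\me\project\app.rb  ->  "C:/Users/me/project/app.rb"
// Windows, UNC path:                  \\server\share\src\app.rb       ->  "server/share/src/app.rb"
// Other operating systems:            /home/me/project/app.rb         ->  "/home/me/project/app.rb" (unchanged)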
/// Convert a user-supplied path to an absolute path, and convert it to a verbatim path on Windows.
pub fn path_from_string(path: &str) -> PathBuf {
let mut path = PathBuf::from(path);
// make path absolute
if path.is_relative() {
path = std::env::current_dir().unwrap().join(path)
};
let mut components = path.components();
// make Windows paths verbatim (with `\\?\` prefixes) which allow for extended-length paths.
let mut result = match components.next() {
None => unreachable!("empty path"),
Some(component) => match component {
std::path::Component::Prefix(prefix) => match prefix.kind() {
std::path::Prefix::Disk(drive) => {
let root = format!(r"\\?\{}:\", drive as char);
PathBuf::from(root)
}
std::path::Prefix::UNC(server, share) => {
let mut root = std::ffi::OsString::from(r"\\?\UNC\");
root.push(server);
root.push(r"\");
root.push(share);
PathBuf::from(root)
}
std::path::Prefix::Verbatim(_)
| std::path::Prefix::VerbatimUNC(_, _)
| std::path::Prefix::VerbatimDisk(_)
| std::path::Prefix::DeviceNS(_) => Path::new(&component).to_path_buf(),
},
_ => Path::new(&component).to_path_buf(),
},
};
// remove `.` and `..` components
for component in components {
match component {
std::path::Component::CurDir => continue,
std::path::Component::ParentDir => {
result.pop();
}
_ => result.push(component),
}
}
result
}
pub fn path_for(dir: &Path, path: &Path, ext: &str) -> PathBuf {
let mut result = PathBuf::from(dir);
for component in path.components() {
match component {
std::path::Component::Prefix(prefix) => match prefix.kind() {
std::path::Prefix::Disk(letter) | std::path::Prefix::VerbatimDisk(letter) => {
result.push(format!("{}_", letter as char))
}
std::path::Prefix::Verbatim(x) | std::path::Prefix::DeviceNS(x) => {
result.push(x);
}
std::path::Prefix::UNC(server, share)
| std::path::Prefix::VerbatimUNC(server, share) => {
result.push("unc");
result.push(server);
result.push(share);
}
},
std::path::Component::RootDir => {
// skip
}
std::path::Component::Normal(_) => {
result.push(component);
}
std::path::Component::CurDir => {
// skip
}
std::path::Component::ParentDir => {
result.pop();
}
}
}
if !ext.is_empty() {
match result.extension() {
Some(x) => {
let mut new_ext = x.to_os_string();
new_ext.push(".");
new_ext.push(ext);
result.set_extension(new_ext);
}
None => {
result.set_extension(ext);
}
}
}
result
}
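path_for rebases a source path under dir and appends ext; a hedged illustration (paths are hypothetical):

// path_for(Path::new("/out"), Path::new("/home/me/app.rb"), "trap")
//   -> /out/home/me/app.rb.trap
// path_for(Path::new("/out"), Path::new(r"\\?\C:\src\app.rb"), "trap")   // on Windows
//   -> /out/C_/src/app.rb.trap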

View File

@@ -1,132 +0,0 @@
use std::collections::BTreeSet as Set;
use std::fmt;
use crate::generator::ql;
/// Represents a distinct entry in the database schema.
pub enum Entry<'a> {
/// An entry defining a database table.
Table(Table<'a>),
/// An entry defining a case split over an integer column of a table.
Case(Case<'a>),
/// An entry defining a type that is a union of other types.
Union(Union<'a>),
}
/// A table in the database schema.
pub struct Table<'a> {
pub name: &'a str,
pub columns: Vec<Column<'a>>,
pub keysets: Option<Vec<&'a str>>,
}
/// A union in the database schema.
pub struct Union<'a> {
pub name: &'a str,
pub members: Set<&'a str>,
}
/// A case split in the database schema.
pub struct Case<'a> {
pub name: &'a str,
pub column: &'a str,
pub branches: Vec<(usize, &'a str)>,
}
/// A column in a table.
pub struct Column<'a> {
pub db_type: DbColumnType,
pub name: &'a str,
pub unique: bool,
pub ql_type: ql::Type<'a>,
pub ql_type_is_ref: bool,
}
/// The database column type.
pub enum DbColumnType {
Int,
String,
}
impl<'a> fmt::Display for Case<'a> {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
writeln!(f, "case @{}.{} of", &self.name, &self.column)?;
let mut sep = " ";
for (c, tp) in &self.branches {
writeln!(f, "{} {} = @{}", sep, c, tp)?;
sep = "|";
}
writeln!(f, ";")
}
}
impl<'a> fmt::Display for Table<'a> {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
if let Some(keyset) = &self.keysets {
write!(f, "#keyset[")?;
for (key_index, key) in keyset.iter().enumerate() {
if key_index > 0 {
write!(f, ", ")?;
}
write!(f, "{}", key)?;
}
writeln!(f, "]")?;
}
writeln!(f, "{}(", self.name)?;
for (column_index, column) in self.columns.iter().enumerate() {
write!(f, " ")?;
if column.unique {
write!(f, "unique ")?;
}
write!(
f,
"{} ",
match column.db_type {
DbColumnType::Int => "int",
DbColumnType::String => "string",
}
)?;
write!(f, "{}: {}", column.name, column.ql_type)?;
if column.ql_type_is_ref {
write!(f, " ref")?;
}
if column_index + 1 != self.columns.len() {
write!(f, ",")?;
}
writeln!(f)?;
}
write!(f, ");")?;
Ok(())
}
}
impl<'a> fmt::Display for Union<'a> {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "@{} = ", self.name)?;
let mut first = true;
for member in &self.members {
if first {
first = false;
} else {
write!(f, " | ")?;
}
write!(f, "@{}", member)?;
}
Ok(())
}
}
/// Generates the dbscheme by writing the given dbscheme `entries` to the `file`.
pub fn write<'a>(file: &mut dyn std::io::Write, entries: &'a [Entry]) -> std::io::Result<()> {
for entry in entries {
match entry {
Entry::Case(case) => write!(file, "{}\n\n", case)?,
Entry::Table(table) => write!(file, "{}\n\n", table)?,
Entry::Union(union) => write!(file, "{}\n\n", union)?,
}
}
Ok(())
}
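To make the Display impls concrete, a small set of entries renders roughly like this (the table, union, and case names are illustrative):

// ruby_tokeninfo(
//   unique int id: @ruby_token,
//   int kind: int ref,
//   string value: string ref
// );
//
// @ruby_underscore_statement = @ruby_call | @ruby_if | @ruby_return
//
// case @ruby_token.kind of
//   0 = @ruby_token_comment
// | 1 = @ruby_token_identifier
// ;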

View File

@@ -1,4 +0,0 @@
pub struct Language {
pub name: String,
pub node_types: &'static str,
}

View File

@@ -1,4 +0,0 @@
pub mod dbscheme;
pub mod language;
pub mod ql;
pub mod ql_gen;

View File

@@ -1,295 +0,0 @@
use std::collections::BTreeSet;
use std::fmt;
#[derive(Clone, Eq, PartialEq, Hash)]
pub enum TopLevel<'a> {
Class(Class<'a>),
Import(Import<'a>),
Module(Module<'a>),
}
impl<'a> fmt::Display for TopLevel<'a> {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
TopLevel::Import(imp) => write!(f, "{}", imp),
TopLevel::Class(cls) => write!(f, "{}", cls),
TopLevel::Module(m) => write!(f, "{}", m),
}
}
}
#[derive(Clone, Eq, PartialEq, Hash)]
pub struct Import<'a> {
pub module: &'a str,
pub alias: Option<&'a str>,
}
impl<'a> fmt::Display for Import<'a> {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "import {}", &self.module)?;
if let Some(name) = &self.alias {
write!(f, " as {}", name)?;
}
Ok(())
}
}
#[derive(Clone, Eq, PartialEq, Hash)]
pub struct Class<'a> {
pub qldoc: Option<String>,
pub name: &'a str,
pub is_abstract: bool,
pub supertypes: BTreeSet<Type<'a>>,
pub characteristic_predicate: Option<Expression<'a>>,
pub predicates: Vec<Predicate<'a>>,
}
impl<'a> fmt::Display for Class<'a> {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
if let Some(qldoc) = &self.qldoc {
write!(f, "/** {} */", qldoc)?;
}
if self.is_abstract {
write!(f, "abstract ")?;
}
write!(f, "class {} extends ", &self.name)?;
for (index, supertype) in self.supertypes.iter().enumerate() {
if index > 0 {
write!(f, ", ")?;
}
write!(f, "{}", supertype)?;
}
writeln!(f, " {{ ")?;
if let Some(charpred) = &self.characteristic_predicate {
writeln!(
f,
" {}",
Predicate {
qldoc: None,
name: self.name,
overridden: false,
is_final: false,
return_type: None,
formal_parameters: vec![],
body: charpred.clone(),
}
)?;
}
for predicate in &self.predicates {
writeln!(f, " {}", predicate)?;
}
write!(f, "}}")?;
Ok(())
}
}
#[derive(Clone, Eq, PartialEq, Hash)]
pub struct Module<'a> {
pub qldoc: Option<String>,
pub name: &'a str,
pub body: Vec<TopLevel<'a>>,
}
impl<'a> fmt::Display for Module<'a> {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
if let Some(qldoc) = &self.qldoc {
write!(f, "/** {} */", qldoc)?;
}
writeln!(f, "module {} {{ ", self.name)?;
for decl in &self.body {
writeln!(f, " {}", decl)?;
}
write!(f, "}}")?;
Ok(())
}
}
// The QL type of a column.
#[derive(Clone, Eq, PartialEq, Hash, Ord, PartialOrd)]
pub enum Type<'a> {
/// Primitive `int` type.
Int,
/// Primitive `string` type.
String,
/// A database type that will need to be referred to with an `@` prefix.
At(&'a str),
/// A user-defined type.
Normal(&'a str),
}
impl<'a> fmt::Display for Type<'a> {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
Type::Int => write!(f, "int"),
Type::String => write!(f, "string"),
Type::Normal(name) => write!(f, "{}", name),
Type::At(name) => write!(f, "@{}", name),
}
}
}
#[derive(Clone, Eq, PartialEq, Hash)]
pub enum Expression<'a> {
Var(&'a str),
String(&'a str),
Integer(usize),
Pred(&'a str, Vec<Expression<'a>>),
And(Vec<Expression<'a>>),
Or(Vec<Expression<'a>>),
Equals(Box<Expression<'a>>, Box<Expression<'a>>),
Dot(Box<Expression<'a>>, &'a str, Vec<Expression<'a>>),
Aggregate {
name: &'a str,
vars: Vec<FormalParameter<'a>>,
range: Option<Box<Expression<'a>>>,
expr: Box<Expression<'a>>,
second_expr: Option<Box<Expression<'a>>>,
},
}
impl<'a> fmt::Display for Expression<'a> {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
Expression::Var(x) => write!(f, "{}", x),
Expression::String(s) => write!(f, "\"{}\"", s),
Expression::Integer(n) => write!(f, "{}", n),
Expression::Pred(n, args) => {
write!(f, "{}(", n)?;
for (index, arg) in args.iter().enumerate() {
if index > 0 {
write!(f, ", ")?;
}
write!(f, "{}", arg)?;
}
write!(f, ")")
}
Expression::And(conjuncts) => {
if conjuncts.is_empty() {
write!(f, "any()")
} else {
for (index, conjunct) in conjuncts.iter().enumerate() {
if index > 0 {
write!(f, " and ")?;
}
write!(f, "({})", conjunct)?;
}
Ok(())
}
}
Expression::Or(disjuncts) => {
if disjuncts.is_empty() {
write!(f, "none()")
} else {
for (index, disjunct) in disjuncts.iter().enumerate() {
if index > 0 {
write!(f, " or ")?;
}
write!(f, "({})", disjunct)?;
}
Ok(())
}
}
Expression::Equals(a, b) => write!(f, "{} = {}", a, b),
Expression::Dot(x, member_pred, args) => {
write!(f, "{}.{}(", x, member_pred)?;
for (index, arg) in args.iter().enumerate() {
if index > 0 {
write!(f, ", ")?;
}
write!(f, "{}", arg)?;
}
write!(f, ")")
}
Expression::Aggregate {
name,
vars,
range,
expr,
second_expr,
} => {
write!(f, "{}(", name)?;
if !vars.is_empty() {
for (index, var) in vars.iter().enumerate() {
if index > 0 {
write!(f, ", ")?;
}
write!(f, "{}", var)?;
}
write!(f, " | ")?;
}
if let Some(range) = range {
write!(f, "{} | ", range)?;
}
write!(f, "{}", expr)?;
if let Some(second_expr) = second_expr {
write!(f, ", {}", second_expr)?;
}
write!(f, ")")
}
}
}
}
#[derive(Clone, Eq, PartialEq, Hash)]
pub struct Predicate<'a> {
pub qldoc: Option<String>,
pub name: &'a str,
pub overridden: bool,
pub is_final: bool,
pub return_type: Option<Type<'a>>,
pub formal_parameters: Vec<FormalParameter<'a>>,
pub body: Expression<'a>,
}
impl<'a> fmt::Display for Predicate<'a> {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
if let Some(qldoc) = &self.qldoc {
write!(f, "/** {} */", qldoc)?;
}
if self.is_final {
write!(f, "final ")?;
}
if self.overridden {
write!(f, "override ")?;
}
match &self.return_type {
None => write!(f, "predicate ")?,
Some(return_type) => write!(f, "{} ", return_type)?,
}
write!(f, "{}(", self.name)?;
for (index, param) in self.formal_parameters.iter().enumerate() {
if index > 0 {
write!(f, ", ")?;
}
write!(f, "{}", param)?;
}
write!(f, ") {{ {} }}", self.body)?;
Ok(())
}
}
#[derive(Clone, Eq, PartialEq, Hash)]
pub struct FormalParameter<'a> {
pub name: &'a str,
pub param_type: Type<'a>,
}
impl<'a> fmt::Display for FormalParameter<'a> {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "{} {}", self.param_type, self.name)
}
}
/// Generates a QL library by writing the given `elements` to the `file`.
pub fn write<'a>(file: &mut dyn std::io::Write, elements: &'a [TopLevel]) -> std::io::Result<()> {
for element in elements {
write!(file, "{}\n\n", &element)?;
}
Ok(())
}

View File

@@ -1,566 +0,0 @@
use std::collections::BTreeSet;
use crate::{generator::ql, node_types};
/// Creates the hard-coded `AstNode` class that acts as a supertype of all
/// classes we generate.
pub fn create_ast_node_class<'a>(ast_node: &'a str, node_info_table: &'a str) -> ql::Class<'a> {
// Default implementation of `toString` calls `this.getAPrimaryQlClass()`
let to_string = ql::Predicate {
qldoc: Some(String::from(
"Gets a string representation of this element.",
)),
name: "toString",
overridden: false,
is_final: false,
return_type: Some(ql::Type::String),
formal_parameters: vec![],
body: ql::Expression::Equals(
Box::new(ql::Expression::Var("result")),
Box::new(ql::Expression::Dot(
Box::new(ql::Expression::Var("this")),
"getAPrimaryQlClass",
vec![],
)),
),
};
let get_location = ql::Predicate {
name: "getLocation",
qldoc: Some(String::from("Gets the location of this element.")),
overridden: false,
is_final: true,
return_type: Some(ql::Type::Normal("L::Location")),
formal_parameters: vec![],
body: ql::Expression::Pred(
node_info_table,
vec![
ql::Expression::Var("this"),
ql::Expression::Var("_"), // parent
ql::Expression::Var("_"), // parent index
ql::Expression::Var("result"), // location
],
),
};
let get_a_field_or_child = create_none_predicate(
Some(String::from("Gets a field or child node of this node.")),
"getAFieldOrChild",
false,
Some(ql::Type::Normal("AstNode")),
);
let get_parent = ql::Predicate {
qldoc: Some(String::from("Gets the parent of this element.")),
name: "getParent",
overridden: false,
is_final: true,
return_type: Some(ql::Type::Normal("AstNode")),
formal_parameters: vec![],
body: ql::Expression::Pred(
node_info_table,
vec![
ql::Expression::Var("this"),
ql::Expression::Var("result"),
ql::Expression::Var("_"), // parent index
ql::Expression::Var("_"), // location
],
),
};
let get_parent_index = ql::Predicate {
qldoc: Some(String::from(
"Gets the index of this node among the children of its parent.",
)),
name: "getParentIndex",
overridden: false,
is_final: true,
return_type: Some(ql::Type::Int),
formal_parameters: vec![],
body: ql::Expression::Pred(
node_info_table,
vec![
ql::Expression::Var("this"),
ql::Expression::Var("_"), // parent
ql::Expression::Var("result"), // parent index
ql::Expression::Var("_"), // location
],
),
};
let get_a_primary_ql_class = ql::Predicate {
qldoc: Some(String::from(
"Gets the name of the primary QL class for this element.",
)),
name: "getAPrimaryQlClass",
overridden: false,
is_final: false,
return_type: Some(ql::Type::String),
formal_parameters: vec![],
body: ql::Expression::Equals(
Box::new(ql::Expression::Var("result")),
Box::new(ql::Expression::String("???")),
),
};
let get_primary_ql_classes = ql::Predicate {
qldoc: Some(
"Gets a comma-separated list of the names of the primary CodeQL \
classes to which this element belongs."
.to_owned(),
),
name: "getPrimaryQlClasses",
overridden: false,
is_final: false,
return_type: Some(ql::Type::String),
formal_parameters: vec![],
body: ql::Expression::Equals(
Box::new(ql::Expression::Var("result")),
Box::new(ql::Expression::Aggregate {
name: "concat",
vars: vec![],
range: None,
expr: Box::new(ql::Expression::Dot(
Box::new(ql::Expression::Var("this")),
"getAPrimaryQlClass",
vec![],
)),
second_expr: Some(Box::new(ql::Expression::String(","))),
}),
),
};
ql::Class {
qldoc: Some(String::from("The base class for all AST nodes")),
name: "AstNode",
is_abstract: false,
supertypes: vec![ql::Type::At(ast_node)].into_iter().collect(),
characteristic_predicate: None,
predicates: vec![
to_string,
get_location,
get_parent,
get_parent_index,
get_a_field_or_child,
get_a_primary_ql_class,
get_primary_ql_classes,
],
}
}
pub fn create_token_class<'a>(token_type: &'a str, tokeninfo: &'a str) -> ql::Class<'a> {
let tokeninfo_arity = 3; // id, kind, value
let get_value = ql::Predicate {
qldoc: Some(String::from("Gets the value of this token.")),
name: "getValue",
overridden: false,
is_final: true,
return_type: Some(ql::Type::String),
formal_parameters: vec![],
body: create_get_field_expr_for_column_storage("result", tokeninfo, 1, tokeninfo_arity),
};
let to_string = ql::Predicate {
qldoc: Some(String::from(
"Gets a string representation of this element.",
)),
name: "toString",
overridden: true,
is_final: true,
return_type: Some(ql::Type::String),
formal_parameters: vec![],
body: ql::Expression::Equals(
Box::new(ql::Expression::Var("result")),
Box::new(ql::Expression::Dot(
Box::new(ql::Expression::Var("this")),
"getValue",
vec![],
)),
),
};
ql::Class {
qldoc: Some(String::from("A token.")),
name: "Token",
is_abstract: false,
supertypes: vec![ql::Type::At(token_type), ql::Type::Normal("AstNode")]
.into_iter()
.collect(),
characteristic_predicate: None,
predicates: vec![
get_value,
to_string,
create_get_a_primary_ql_class("Token", false),
],
}
}
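Rendered through the ql Display impls above, the class built here comes out roughly as follows (the @ruby_token and ruby_tokeninfo names are illustrative; whitespace is tidied up):

// /** A token. */
// class Token extends @ruby_token, AstNode {
//   /** Gets the value of this token. */
//   final string getValue() { ruby_tokeninfo(this, _, result) }
//   /** Gets a string representation of this element. */
//   final override string toString() { result = this.getValue() }
//   /** Gets the name of the primary QL class for this element. */
//   override string getAPrimaryQlClass() { result = "Token" }
// }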
// Creates the `ReservedWord` class.
pub fn create_reserved_word_class(db_name: &str) -> ql::Class {
let class_name = "ReservedWord";
let get_a_primary_ql_class = create_get_a_primary_ql_class(class_name, true);
ql::Class {
qldoc: Some(String::from("A reserved word.")),
name: class_name,
is_abstract: false,
supertypes: vec![ql::Type::At(db_name), ql::Type::Normal("Token")]
.into_iter()
.collect(),
characteristic_predicate: None,
predicates: vec![get_a_primary_ql_class],
}
}
/// Creates a predicate whose body is `none()`.
fn create_none_predicate<'a>(
qldoc: Option<String>,
name: &'a str,
overridden: bool,
return_type: Option<ql::Type<'a>>,
) -> ql::Predicate<'a> {
ql::Predicate {
qldoc,
name,
overridden,
is_final: false,
return_type,
formal_parameters: Vec::new(),
body: ql::Expression::Pred("none", vec![]),
}
}
/// Creates an overridden `getAPrimaryQlClass` predicate that returns the given
/// name.
fn create_get_a_primary_ql_class(class_name: &str, is_final: bool) -> ql::Predicate {
ql::Predicate {
qldoc: Some(String::from(
"Gets the name of the primary QL class for this element.",
)),
name: "getAPrimaryQlClass",
overridden: true,
is_final,
return_type: Some(ql::Type::String),
formal_parameters: vec![],
body: ql::Expression::Equals(
Box::new(ql::Expression::Var("result")),
Box::new(ql::Expression::String(class_name)),
),
}
}
/// Returns an expression to get a field that's defined as a column in the parent's table.
///
/// # Arguments
///
/// * `result_var_name` - the name of the variable to which the resulting value should be bound
/// * `table_name` - the name of the parent's defining table
/// * `column_index` - the index in that table that defines the field
/// * `arity` - the total number of columns in the table
fn create_get_field_expr_for_column_storage<'a>(
result_var_name: &'a str,
table_name: &'a str,
column_index: usize,
arity: usize,
) -> ql::Expression<'a> {
let num_underscores_before = column_index;
let num_underscores_after = arity - 2 - num_underscores_before;
ql::Expression::Pred(
table_name,
[
vec![ql::Expression::Var("this")],
vec![ql::Expression::Var("_"); num_underscores_before],
vec![ql::Expression::Var(result_var_name)],
vec![ql::Expression::Var("_"); num_underscores_after],
]
.concat(),
)
}
/// Returns an expression to get the field with the given index from its
/// auxiliary table. The index name can be "_" so the expression will hold for
/// all indices.
fn create_get_field_expr_for_table_storage<'a>(
result_var_name: &'a str,
table_name: &'a str,
index_var_name: Option<&'a str>,
) -> ql::Expression<'a> {
ql::Expression::Pred(
table_name,
match index_var_name {
Some(index_var_name) => vec![
ql::Expression::Var("this"),
ql::Expression::Var(index_var_name),
ql::Expression::Var(result_var_name),
],
None => vec![ql::Expression::Var("this"), ql::Expression::Var("result")],
},
)
}
/// Creates a pair consisting of a predicate to get the given field, and an
/// optional expression that will get the same field. When the field can occur
/// multiple times, the predicate will take an index argument, while the
/// expression will use the "don't care" expression to hold for all occurrences.
///
/// # Arguments
///
/// `main_table_name` - the name of the defining table for the parent node
/// `main_table_arity` - the number of columns in the main table
/// `main_table_column_index` - a mutable reference to a column index indicating
/// where the field is in the main table. If this is used (i.e. the field has
/// column storage), then the index is incremented.
/// `field` - the field whose getters we are creating
/// `nodes` - the map from type names to node entries, used to resolve the
/// field's QL class name
fn create_field_getters<'a>(
main_table_name: &'a str,
main_table_arity: usize,
main_table_column_index: &mut usize,
field: &'a node_types::Field,
nodes: &'a node_types::NodeTypeMap,
) -> (ql::Predicate<'a>, Option<ql::Expression<'a>>) {
let return_type = match &field.type_info {
node_types::FieldTypeInfo::Single(t) => {
Some(ql::Type::Normal(&nodes.get(t).unwrap().ql_class_name))
}
node_types::FieldTypeInfo::Multiple {
types: _,
dbscheme_union: _,
ql_class,
} => Some(ql::Type::Normal(ql_class)),
node_types::FieldTypeInfo::ReservedWordInt(_) => Some(ql::Type::String),
};
let formal_parameters = match &field.storage {
node_types::Storage::Column { .. } => vec![],
node_types::Storage::Table { has_index, .. } => {
if *has_index {
vec![ql::FormalParameter {
name: "i",
param_type: ql::Type::Int,
}]
} else {
vec![]
}
}
};
// For the expression to get a value, what variable name should the result
// be bound to?
let get_value_result_var_name = match &field.type_info {
node_types::FieldTypeInfo::ReservedWordInt(_) => "value",
node_types::FieldTypeInfo::Single(_) => "result",
node_types::FieldTypeInfo::Multiple { .. } => "result",
};
// Two expressions for getting the value. One that's suitable use in the
// getter predicate (where there may be a specific index), and another for
// use in `getAFieldOrChild` (where we use a "don't care" expression to
// match any index).
let (get_value, get_value_any_index) = match &field.storage {
node_types::Storage::Column { name: _ } => {
let column_index = *main_table_column_index;
*main_table_column_index += 1;
(
create_get_field_expr_for_column_storage(
get_value_result_var_name,
main_table_name,
column_index,
main_table_arity,
),
create_get_field_expr_for_column_storage(
get_value_result_var_name,
main_table_name,
column_index,
main_table_arity,
),
)
}
node_types::Storage::Table {
name: field_table_name,
has_index,
column_name: _,
} => (
create_get_field_expr_for_table_storage(
get_value_result_var_name,
field_table_name,
if *has_index { Some("i") } else { None },
),
create_get_field_expr_for_table_storage(
get_value_result_var_name,
field_table_name,
if *has_index { Some("_") } else { None },
),
),
};
let (body, optional_expr) = match &field.type_info {
node_types::FieldTypeInfo::ReservedWordInt(int_mapping) => {
// Create an expression that binds the corresponding string to `result` for each `value`, e.g.:
// result = "foo" and value = 0 or
// result = "bar" and value = 1 or
// result = "baz" and value = 2
let disjuncts = int_mapping
.iter()
.map(|(token_str, (value, _))| {
ql::Expression::And(vec![
ql::Expression::Equals(
Box::new(ql::Expression::Var("result")),
Box::new(ql::Expression::String(token_str)),
),
ql::Expression::Equals(
Box::new(ql::Expression::Var("value")),
Box::new(ql::Expression::Integer(*value)),
),
])
})
.collect();
(
ql::Expression::Aggregate {
name: "exists",
vars: vec![ql::FormalParameter {
name: "value",
param_type: ql::Type::Int,
}],
range: Some(Box::new(get_value)),
expr: Box::new(ql::Expression::Or(disjuncts)),
second_expr: None,
},
// Since the getter returns a string and not an AstNode, it won't be part of getAFieldOrChild:
None,
)
}
node_types::FieldTypeInfo::Single(_) | node_types::FieldTypeInfo::Multiple { .. } => {
(get_value, Some(get_value_any_index))
}
};
let qldoc = match &field.name {
Some(name) => format!("Gets the node corresponding to the field `{}`.", name),
None => {
if formal_parameters.is_empty() {
"Gets the child of this node.".to_owned()
} else {
"Gets the `i`th child of this node.".to_owned()
}
}
};
(
ql::Predicate {
qldoc: Some(qldoc),
name: &field.getter_name,
overridden: false,
is_final: true,
return_type,
formal_parameters,
body,
},
optional_expr,
)
}
/// Converts the given node types into CodeQL classes wrapping the dbscheme.
pub fn convert_nodes(nodes: &node_types::NodeTypeMap) -> Vec<ql::TopLevel> {
let mut classes: Vec<ql::TopLevel> = Vec::new();
let mut token_kinds = BTreeSet::new();
for (type_name, node) in nodes {
if let node_types::EntryKind::Token { .. } = &node.kind {
if type_name.named {
token_kinds.insert(&type_name.kind);
}
}
}
for (type_name, node) in nodes {
match &node.kind {
node_types::EntryKind::Token { kind_id: _ } => {
if type_name.named {
let get_a_primary_ql_class =
create_get_a_primary_ql_class(&node.ql_class_name, true);
let mut supertypes: BTreeSet<ql::Type> = BTreeSet::new();
supertypes.insert(ql::Type::At(&node.dbscheme_name));
supertypes.insert(ql::Type::Normal("Token"));
classes.push(ql::TopLevel::Class(ql::Class {
qldoc: Some(format!("A class representing `{}` tokens.", type_name.kind)),
name: &node.ql_class_name,
is_abstract: false,
supertypes,
characteristic_predicate: None,
predicates: vec![get_a_primary_ql_class],
}));
}
}
node_types::EntryKind::Union { members: _ } => {
// It's a tree-sitter supertype node, so we're wrapping a dbscheme
// union type.
classes.push(ql::TopLevel::Class(ql::Class {
qldoc: None,
name: &node.ql_class_name,
is_abstract: false,
supertypes: vec![
ql::Type::At(&node.dbscheme_name),
ql::Type::Normal("AstNode"),
]
.into_iter()
.collect(),
characteristic_predicate: None,
predicates: vec![],
}));
}
node_types::EntryKind::Table {
name: main_table_name,
fields,
} => {
if fields.is_empty() {
panic!("Encountered node '{}' with no fields", type_name.kind);
}
// Count how many columns there will be in the main table. There
// will be one for the id, plus one for each field that's stored
// as a column.
let main_table_arity = 1 + fields
.iter()
.filter(|&f| matches!(f.storage, node_types::Storage::Column { .. }))
.count();
let main_class_name = &node.ql_class_name;
let mut main_class = ql::Class {
qldoc: Some(format!("A class representing `{}` nodes.", type_name.kind)),
name: main_class_name,
is_abstract: false,
supertypes: vec![
ql::Type::At(&node.dbscheme_name),
ql::Type::Normal("AstNode"),
]
.into_iter()
.collect(),
characteristic_predicate: None,
predicates: vec![create_get_a_primary_ql_class(main_class_name, true)],
};
let mut main_table_column_index: usize = 0;
let mut get_child_exprs: Vec<ql::Expression> = Vec::new();
// Iterate through the fields, creating:
// - classes to wrap union types if fields need them,
// - predicates to access the fields,
// - the QL expressions to access the fields that will be part of getAFieldOrChild.
for field in fields {
let (get_pred, get_child_expr) = create_field_getters(
main_table_name,
main_table_arity,
&mut main_table_column_index,
field,
nodes,
);
main_class.predicates.push(get_pred);
if let Some(get_child_expr) = get_child_expr {
get_child_exprs.push(get_child_expr)
}
}
main_class.predicates.push(ql::Predicate {
qldoc: Some(String::from("Gets a field or child node of this node.")),
name: "getAFieldOrChild",
overridden: true,
is_final: true,
return_type: Some(ql::Type::Normal("AstNode")),
formal_parameters: vec![],
body: ql::Expression::Or(get_child_exprs),
});
classes.push(ql::TopLevel::Class(main_class));
}
}
}
classes
}

View File

@@ -1,6 +0,0 @@
pub mod diagnostics;
pub mod extractor;
pub mod file_paths;
pub mod generator;
pub mod node_types;
pub mod trap;

View File

@@ -1,449 +0,0 @@
use serde::Deserialize;
use std::collections::BTreeMap;
use std::collections::BTreeSet as Set;
use std::fs;
use std::path::Path;
/// A lookup table from TypeName to Entry.
pub type NodeTypeMap = BTreeMap<TypeName, Entry>;
#[derive(Debug)]
pub struct Entry {
pub dbscheme_name: String,
pub ql_class_name: String,
pub kind: EntryKind,
}
#[derive(Debug)]
pub enum EntryKind {
Union { members: Set<TypeName> },
Table { name: String, fields: Vec<Field> },
Token { kind_id: usize },
}
#[derive(Debug, Ord, PartialOrd, Eq, PartialEq)]
pub struct TypeName {
pub kind: String,
pub named: bool,
}
#[derive(Debug)]
pub enum FieldTypeInfo {
/// The field has a single type.
Single(TypeName),
/// The field can take one of several types, so we also provide the name of
/// the database union type that wraps them, and the corresponding QL class
/// name.
Multiple {
types: Set<TypeName>,
dbscheme_union: String,
ql_class: String,
},
/// The field can be one of several tokens, so the db type will be an `int`
/// with a `case @foo.kind` for each possibility.
ReservedWordInt(BTreeMap<String, (usize, String)>),
}
#[derive(Debug)]
pub struct Field {
pub parent: TypeName,
pub type_info: FieldTypeInfo,
/// The name of the field or None for the anonymous 'children'
/// entry from node_types.json
pub name: Option<String>,
/// The name of the predicate to get this field.
pub getter_name: String,
pub storage: Storage,
}
fn name_for_field_or_child(name: &Option<String>) -> String {
match name {
Some(name) => name.clone(),
None => "child".to_owned(),
}
}
#[derive(Debug)]
pub enum Storage {
/// the field is stored as a column in the parent table
Column { name: String },
/// the field is stored in a link table
Table {
/// the name of the table
name: String,
/// the name of the column for the field in the dbscheme
column_name: String,
/// does it have an associated index column?
has_index: bool,
},
}
impl Storage {
pub fn is_column(&self) -> bool {
matches!(self, Storage::Column { .. })
}
}
pub fn read_node_types(prefix: &str, node_types_path: &Path) -> std::io::Result<NodeTypeMap> {
let file = fs::File::open(node_types_path)?;
let node_types: Vec<NodeInfo> = serde_json::from_reader(file)?;
Ok(convert_nodes(prefix, &node_types))
}
pub fn read_node_types_str(prefix: &str, node_types_json: &str) -> std::io::Result<NodeTypeMap> {
let node_types: Vec<NodeInfo> = serde_json::from_str(node_types_json)?;
Ok(convert_nodes(prefix, &node_types))
}
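// A minimal sketch (hypothetical JSON fragment, not taken from a real grammar)
// of how `read_node_types_str` builds the map: nodes with no fields, children
// or subtypes become token entries keyed by their (kind, named) pair.
#[test]
fn read_node_types_str_sketch() {
    let json = r#"[
        {"type": "comment", "named": true},
        {"type": "+", "named": false}
    ]"#;
    let nodes = read_node_types_str("ruby", json).unwrap();
    let comment = nodes
        .get(&TypeName {
            kind: "comment".to_owned(),
            named: true,
        })
        .unwrap();
    assert!(matches!(comment.kind, EntryKind::Token { .. }));
    assert_eq!("ruby_token_comment", comment.dbscheme_name);
}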
fn convert_type(node_type: &NodeType) -> TypeName {
TypeName {
kind: node_type.kind.to_string(),
named: node_type.named,
}
}
fn convert_types(node_types: &[NodeType]) -> Set<TypeName> {
node_types.iter().map(convert_type).collect()
}
pub fn convert_nodes(prefix: &str, nodes: &[NodeInfo]) -> NodeTypeMap {
let mut entries = NodeTypeMap::new();
let mut token_kinds = Set::new();
// First, find all the token kinds
for node in nodes {
if node.subtypes.is_none()
&& node.fields.as_ref().map_or(0, |x| x.len()) == 0
&& node.children.is_none()
{
let type_name = TypeName {
kind: node.kind.clone(),
named: node.named,
};
token_kinds.insert(type_name);
}
}
for node in nodes {
let flattened_name = &node_type_name(&node.kind, node.named);
let dbscheme_name = escape_name(flattened_name);
let ql_class_name = dbscheme_name_to_class_name(&dbscheme_name);
let dbscheme_name = format!("{}_{}", prefix, &dbscheme_name);
if let Some(subtypes) = &node.subtypes {
// It's a tree-sitter supertype node, for which we create a union
// type.
entries.insert(
TypeName {
kind: node.kind.clone(),
named: node.named,
},
Entry {
dbscheme_name,
ql_class_name,
kind: EntryKind::Union {
members: convert_types(subtypes),
},
},
);
} else if node.fields.as_ref().map_or(0, |x| x.len()) == 0 && node.children.is_none() {
// Token kind, handled above.
} else {
// It's a product type, defined by a table.
let type_name = TypeName {
kind: node.kind.clone(),
named: node.named,
};
let table_name = escape_name(&(format!("{}_def", &flattened_name)));
let table_name = format!("{}_{}", prefix, &table_name);
let mut fields = Vec::new();
// If the type also has fields or children, then we create either
// auxiliary tables or columns in the defining table for them.
if let Some(node_fields) = &node.fields {
for (field_name, field_info) in node_fields {
add_field(
prefix,
&type_name,
Some(field_name.to_string()),
field_info,
&mut fields,
&token_kinds,
);
}
}
if let Some(children) = &node.children {
// Treat children as if they were a field called 'child'.
add_field(
prefix,
&type_name,
None,
children,
&mut fields,
&token_kinds,
);
}
entries.insert(
type_name,
Entry {
dbscheme_name,
ql_class_name,
kind: EntryKind::Table {
name: table_name,
fields,
},
},
);
}
}
let mut counter = 0;
for type_name in token_kinds {
let entry = if type_name.named {
counter += 1;
let unprefixed_name = node_type_name(&type_name.kind, true);
Entry {
dbscheme_name: escape_name(&format!("{}_token_{}", &prefix, &unprefixed_name)),
ql_class_name: dbscheme_name_to_class_name(&escape_name(&unprefixed_name)),
kind: EntryKind::Token { kind_id: counter },
}
} else {
Entry {
dbscheme_name: format!("{}_reserved_word", &prefix),
ql_class_name: "ReservedWord".to_owned(),
kind: EntryKind::Token { kind_id: 0 },
}
};
entries.insert(type_name, entry);
}
entries
}
fn add_field(
prefix: &str,
parent_type_name: &TypeName,
field_name: Option<String>,
field_info: &FieldInfo,
fields: &mut Vec<Field>,
token_kinds: &Set<TypeName>,
) {
let parent_flattened_name = node_type_name(&parent_type_name.kind, parent_type_name.named);
let column_name = escape_name(&name_for_field_or_child(&field_name));
let storage = if !field_info.multiple && field_info.required {
// This field must appear exactly once, so we add it as
// a column to the main table for the node type.
Storage::Column { name: column_name }
} else {
// Put the field in an auxiliary table.
let has_index = field_info.multiple;
let field_table_name = escape_name(&format!(
"{}_{}_{}",
&prefix,
parent_flattened_name,
&name_for_field_or_child(&field_name)
));
Storage::Table {
has_index,
name: field_table_name,
column_name,
}
};
let converted_types = convert_types(&field_info.types);
let type_info = if storage.is_column()
&& field_info
.types
.iter()
.all(|t| !t.named && token_kinds.contains(&convert_type(t)))
{
// All possible types for this field are reserved words. The db
// representation will be an `int` with a `case @foo.field = ...` to
// enumerate the possible values.
let mut field_token_ints: BTreeMap<String, (usize, String)> = BTreeMap::new();
for (counter, t) in converted_types.into_iter().enumerate() {
let dbscheme_variant_name =
escape_name(&format!("{}_{}_{}", &prefix, parent_flattened_name, t.kind));
field_token_ints.insert(t.kind.to_owned(), (counter, dbscheme_variant_name));
}
FieldTypeInfo::ReservedWordInt(field_token_ints)
} else if field_info.types.len() == 1 {
FieldTypeInfo::Single(converted_types.into_iter().next().unwrap())
} else {
// The dbscheme type for this field will be a union. In QL, it'll just be AstNode.
FieldTypeInfo::Multiple {
types: converted_types,
dbscheme_union: format!(
"{}_{}_{}_type",
&prefix,
&parent_flattened_name,
&name_for_field_or_child(&field_name)
),
ql_class: "AstNode".to_owned(),
}
};
let getter_name = format!(
"get{}",
dbscheme_name_to_class_name(&escape_name(&name_for_field_or_child(&field_name)))
);
fields.push(Field {
parent: TypeName {
kind: parent_type_name.kind.to_string(),
named: parent_type_name.named,
},
type_info,
name: field_name,
getter_name,
storage,
});
}
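// A small sketch (hypothetical node and field names) of the storage decision
// above: a required, single-valued field lands in a column of the parent
// table, while a repeated field gets its own link table with an index column.
#[test]
fn add_field_storage_sketch() {
    let parent = TypeName {
        kind: "call".to_owned(),
        named: true,
    };
    let single = FieldInfo {
        multiple: false,
        required: true,
        types: vec![NodeType {
            kind: "identifier".to_owned(),
            named: true,
        }],
    };
    let repeated = FieldInfo {
        multiple: true,
        required: false,
        types: vec![NodeType {
            kind: "argument".to_owned(),
            named: true,
        }],
    };
    let mut fields = Vec::new();
    add_field("ruby", &parent, Some("method".to_owned()), &single, &mut fields, &Set::new());
    add_field("ruby", &parent, Some("arguments".to_owned()), &repeated, &mut fields, &Set::new());
    assert!(matches!(fields[0].storage, Storage::Column { .. }));
    assert!(matches!(
        fields[1].storage,
        Storage::Table { has_index: true, .. }
    ));
}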
#[derive(Deserialize)]
pub struct NodeInfo {
#[serde(rename = "type")]
pub kind: String,
pub named: bool,
#[serde(skip_serializing_if = "Option::is_none")]
pub fields: Option<BTreeMap<String, FieldInfo>>,
#[serde(skip_serializing_if = "Option::is_none")]
pub children: Option<FieldInfo>,
#[serde(skip_serializing_if = "Option::is_none")]
pub subtypes: Option<Vec<NodeType>>,
}
#[derive(Deserialize)]
pub struct NodeType {
#[serde(rename = "type")]
pub kind: String,
pub named: bool,
}
#[derive(Deserialize)]
pub struct FieldInfo {
pub multiple: bool,
pub required: bool,
pub types: Vec<NodeType>,
}
/// Given a tree-sitter node type's (kind, named) pair, returns a single string
/// representing the (unescaped) name we'll use to refer to the corresponding
/// QL type.
fn node_type_name(kind: &str, named: bool) -> String {
if named {
kind.to_string()
} else {
format!("{}_unnamed", kind)
}
}
const RESERVED_KEYWORDS: [&str; 14] = [
"boolean", "case", "date", "float", "int", "key", "of", "order", "ref", "string", "subtype",
"type", "unique", "varchar",
];
/// Returns a string that's a copy of `name` but suitably escaped to be a valid
/// QL identifier.
fn escape_name(name: &str) -> String {
let mut result = String::new();
// If there's a leading underscore, replace it with 'underscore_'.
if let Some(c) = name.chars().next() {
if c == '_' {
result.push_str("underscore");
}
}
for c in name.chars() {
match c {
'{' => result.push_str("lbrace"),
'}' => result.push_str("rbrace"),
'<' => result.push_str("langle"),
'>' => result.push_str("rangle"),
'[' => result.push_str("lbracket"),
']' => result.push_str("rbracket"),
'(' => result.push_str("lparen"),
')' => result.push_str("rparen"),
'|' => result.push_str("pipe"),
'=' => result.push_str("equal"),
'~' => result.push_str("tilde"),
'?' => result.push_str("question"),
'`' => result.push_str("backtick"),
'^' => result.push_str("caret"),
'!' => result.push_str("bang"),
'#' => result.push_str("hash"),
'%' => result.push_str("percent"),
'&' => result.push_str("ampersand"),
'.' => result.push_str("dot"),
',' => result.push_str("comma"),
'/' => result.push_str("slash"),
':' => result.push_str("colon"),
';' => result.push_str("semicolon"),
'"' => result.push_str("dquote"),
'*' => result.push_str("star"),
'+' => result.push_str("plus"),
'-' => result.push_str("minus"),
'@' => result.push_str("at"),
_ if c.is_uppercase() => {
result.push('_');
result.push_str(&c.to_lowercase().to_string())
}
_ => result.push(c),
}
}
for &keyword in &RESERVED_KEYWORDS {
if result == keyword {
result.push_str("__");
break;
}
}
result
}
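// A few concrete cases of the escaping rules above (a sketch derived from the
// match arms, not part of the original test suite): punctuation is spelled
// out, uppercase letters gain a '_' prefix, and dbscheme keywords get "__".
#[test]
fn escape_name_sketch() {
    assert_eq!("lbracerbrace", escape_name("{}"));
    assert_eq!("underscore_foo", escape_name("_foo"));
    assert_eq!("_foo_bar", escape_name("FooBar"));
    assert_eq!("type__", escape_name("type"));
}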
pub fn to_snake_case(word: &str) -> String {
let mut prev_upper = true;
let mut result = String::new();
for c in word.chars() {
if c.is_uppercase() {
if !prev_upper {
result.push('_')
}
prev_upper = true;
result.push(c.to_ascii_lowercase());
} else {
prev_upper = false;
result.push(c);
}
}
result
}
/// Given a valid dbscheme name (i.e. in snake case), produces the equivalent QL
/// name (i.e. in CamelCase). For example, "foo_bar_baz" becomes "FooBarBaz".
fn dbscheme_name_to_class_name(dbscheme_name: &str) -> String {
fn to_title_case(word: &str) -> String {
let mut first = true;
let mut result = String::new();
for c in word.chars() {
if first {
first = false;
result.push(c.to_ascii_uppercase());
} else {
result.push(c);
}
}
result
}
dbscheme_name
.split('_')
.map(to_title_case)
.collect::<Vec<String>>()
.join("")
}
#[test]
fn to_snake_case_test() {
assert_eq!("ruby", to_snake_case("Ruby"));
assert_eq!("erb", to_snake_case("ERB"));
assert_eq!("embedded_template", to_snake_case("EmbeddedTemplate"));
}

View File

@@ -1,272 +0,0 @@
use std::borrow::Cow;
use std::fmt;
use std::io::{BufWriter, Write};
use std::path::Path;
use flate2::write::GzEncoder;
pub struct Writer {
/// The accumulated trap entries
trap_output: Vec<Entry>,
/// A counter for generating fresh labels
counter: u32,
/// cache of global keys
global_keys: std::collections::HashMap<String, Label>,
}
impl Writer {
pub fn new() -> Writer {
Writer {
counter: 0,
trap_output: Vec::new(),
global_keys: std::collections::HashMap::new(),
}
}
pub fn fresh_id(&mut self) -> Label {
let label = Label(self.counter);
self.counter += 1;
self.trap_output.push(Entry::FreshId(label));
label
}
/// Gets a label that will hold the unique ID of the passed string at import time.
/// This can be used for incrementally importable TRAP files -- use globally unique
/// strings to compute a unique ID for table tuples.
///
/// Note: You probably want to make sure that the key strings that you use are disjoint
/// for disjoint column types; the standard way of doing this is to prefix (or append)
/// the column type name to the ID. Thus, you might identify methods in Java by the
/// full ID "methods_com.method.package.DeclaringClass.method(argumentList)".
pub fn global_id(&mut self, key: &str) -> (Label, bool) {
if let Some(label) = self.global_keys.get(key) {
return (*label, false);
}
let label = Label(self.counter);
self.counter += 1;
self.global_keys.insert(key.to_owned(), label);
self.trap_output
.push(Entry::MapLabelToKey(label, key.to_owned()));
(label, true)
}
pub fn add_tuple(&mut self, table_name: &str, args: Vec<Arg>) {
self.trap_output
.push(Entry::GenericTuple(table_name.to_owned(), args))
}
pub fn comment(&mut self, text: String) {
self.trap_output.push(Entry::Comment(text));
}
pub fn write_to_file(&self, path: &Path, compression: Compression) -> std::io::Result<()> {
let trap_file = std::fs::File::create(path)?;
match compression {
Compression::None => {
let mut trap_file = BufWriter::new(trap_file);
self.write_trap_entries(&mut trap_file)
}
Compression::Gzip => {
let trap_file = GzEncoder::new(trap_file, flate2::Compression::fast());
let mut trap_file = BufWriter::new(trap_file);
self.write_trap_entries(&mut trap_file)
}
}
}
fn write_trap_entries<W: Write>(&self, file: &mut W) -> std::io::Result<()> {
for trap_entry in &self.trap_output {
writeln!(file, "{}", trap_entry)?;
}
std::io::Result::Ok(())
}
}
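// A minimal usage sketch (hypothetical table name and path, not part of the
// original extractor): `global_id` interns a key once, so asking for the same
// key again reuses the existing label instead of emitting a new entry.
#[test]
fn global_id_usage_sketch() {
    let mut writer = Writer::new();
    let (file_label, is_new) = writer.global_id(&full_id_for_file("/src/main.rb"));
    assert!(is_new);
    writer.add_tuple(
        "files",
        vec![
            Arg::Label(file_label),
            Arg::String("/src/main.rb".to_owned()),
        ],
    );
    let (same_label, is_new) = writer.global_id(&full_id_for_file("/src/main.rb"));
    assert!(!is_new);
    assert_eq!(format!("{}", file_label), format!("{}", same_label));
}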
pub enum Entry {
/// Maps the label to a fresh id, e.g. `#123=*`.
FreshId(Label),
/// Maps the label to a key, e.g. `#7=@"foo"`.
MapLabelToKey(Label, String),
/// foo_bar(arg*)
GenericTuple(String, Vec<Arg>),
Comment(String),
}
impl fmt::Display for Entry {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
Entry::FreshId(label) => write!(f, "{}=*", label),
Entry::MapLabelToKey(label, key) => {
write!(f, "{}=@\"{}\"", label, key.replace("\"", "\"\""))
}
Entry::GenericTuple(name, args) => {
write!(f, "{}(", name)?;
for (index, arg) in args.iter().enumerate() {
if index > 0 {
write!(f, ",")?;
}
write!(f, "{}", arg)?;
}
write!(f, ")")
}
Entry::Comment(line) => write!(f, "// {}", line),
}
}
}
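// A sketch of the textual TRAP forms produced above (labels render in
// hexadecimal, and embedded quotes in keys and strings are doubled):
#[test]
fn entry_display_sketch() {
    assert_eq!("#a=*", format!("{}", Entry::FreshId(Label(10))));
    assert_eq!(
        "#1=@\"say \"\"hi\"\"\"",
        format!("{}", Entry::MapLabelToKey(Label(1), "say \"hi\"".to_owned()))
    );
    let tuple = Entry::GenericTuple(
        "files".to_owned(),
        vec![Arg::Label(Label(1)), Arg::String("foo.rb".to_owned())],
    );
    assert_eq!("files(#1,\"foo.rb\")", format!("{}", tuple));
}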
#[derive(Debug, Copy, Clone)]
// Identifiers of the form #0, #1, ... (written in hexadecimal by the `Display` impl).
pub struct Label(u32);
impl fmt::Display for Label {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "#{:x}", self.0)
}
}
// An untyped argument to a trap `Entry`.
#[derive(Debug, Clone)]
pub enum Arg {
Label(Label),
Int(usize),
String(String),
}
const MAX_STRLEN: usize = 1048576;
impl fmt::Display for Arg {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
Arg::Label(x) => write!(f, "{}", x),
Arg::Int(x) => write!(f, "{}", x),
Arg::String(x) => write!(
f,
"\"{}\"",
limit_string(x, MAX_STRLEN).replace("\"", "\"\"")
),
}
}
}
pub struct Program(Vec<Entry>);
impl fmt::Display for Program {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
let mut text = String::new();
for trap_entry in &self.0 {
text.push_str(&format!("{}\n", trap_entry));
}
write!(f, "{}", text)
}
}
pub fn full_id_for_file(normalized_path: &str) -> String {
format!("{};sourcefile", escape_key(normalized_path))
}
pub fn full_id_for_folder(normalized_path: &str) -> String {
format!("{};folder", escape_key(normalized_path))
}
/// Escapes a string for use in a TRAP key, by replacing special characters with
/// HTML entities.
fn escape_key<'a, S: Into<Cow<'a, str>>>(key: S) -> Cow<'a, str> {
fn needs_escaping(c: char) -> bool {
matches!(c, '&' | '{' | '}' | '"' | '@' | '#')
}
let key = key.into();
if key.contains(needs_escaping) {
let mut escaped = String::with_capacity(2 * key.len());
for c in key.chars() {
match c {
'&' => escaped.push_str("&amp;"),
'{' => escaped.push_str("&lbrace;"),
'}' => escaped.push_str("&rbrace;"),
'"' => escaped.push_str("&quot;"),
'@' => escaped.push_str("&commat;"),
'#' => escaped.push_str("&num;"),
_ => escaped.push(c),
}
}
Cow::Owned(escaped)
} else {
key
}
}
/// Limit the length (in bytes) of a string. If the string's length in bytes is
/// less than or equal to the limit then the entire string is returned. Otherwise
/// the string is sliced at the provided limit. If there is a multi-byte character
/// at the limit then the returned slice will be slightly shorter than the limit to
/// avoid splitting that multi-byte character.
fn limit_string(string: &str, max_size: usize) -> &str {
if string.len() <= max_size {
return string;
}
let p = string.as_bytes();
let mut index = max_size;
// We want to clip the string at [max_size]; however, the character at that position
// may span several bytes. We need to find the first byte of the character. In UTF-8
// encoded data any byte that matches the bit pattern 10XXXXXX is not a start byte.
// Therefore we decrement the index as long as there are bytes matching this pattern.
// This ensures we cut the string at the border between one character and another.
while index > 0 && (p[index] & 0b11000000) == 0b10000000 {
index -= 1;
}
&string[0..index]
}
#[derive(Clone, Copy)]
pub enum Compression {
None,
Gzip,
}
impl Compression {
pub fn from_env(var_name: &str) -> Result<Compression, String> {
match std::env::var(var_name) {
Ok(method) => match Compression::from_string(&method) {
Some(c) => Ok(c),
None => Err(format!("Unknown compression method '{}'", &method)),
},
// Default compression method if the env var isn't set:
Err(_) => Ok(Compression::Gzip),
}
}
pub fn from_string(s: &str) -> Option<Compression> {
match s.to_lowercase().as_ref() {
"none" => Some(Compression::None),
"gzip" => Some(Compression::Gzip),
_ => None,
}
}
pub fn extension(&self) -> &str {
match self {
Compression::None => "trap",
Compression::Gzip => "trap.gz",
}
}
}
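// A small sketch of the string parsing above: matching is case-insensitive,
// and unrecognised values are rejected rather than silently defaulted.
#[test]
fn compression_from_string_sketch() {
    assert!(matches!(Compression::from_string("GZIP"), Some(Compression::Gzip)));
    assert!(matches!(Compression::from_string("none"), Some(Compression::None)));
    assert!(Compression::from_string("zstd").is_none());
}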
#[test]
fn limit_string_test() {
assert_eq!("hello", limit_string(&"hello world".to_owned(), 5));
assert_eq!("hi ☹", limit_string(&"hi ☹☹".to_owned(), 6));
assert_eq!("hi ", limit_string(&"hi ☹☹".to_owned(), 5));
}
#[test]
fn escape_key_test() {
assert_eq!("foo!", escape_key("foo!"));
assert_eq!("foo&lbrace;&rbrace;", escape_key("foo{}"));
assert_eq!("&lbrace;&rbrace;", escape_key("{}"));
assert_eq!("", escape_key(""));
assert_eq!("/path/to/foo.rb", escape_key("/path/to/foo.rb"));
assert_eq!(
"/path/to/foo&amp;&lbrace;&rbrace;&quot;&commat;&num;.rb",
escape_key("/path/to/foo&{}\"@#.rb")
);
}

View File

@@ -14,7 +14,15 @@ else
fi
(cd extractor && "$CARGO" build --release)
extractor/target/release/generator --dbscheme ql/lib/ruby.dbscheme --library ql/lib/codeql/ruby/ast/internal/TreeSitter.qll
# If building via cross, the binaries will be in extractor/target/<triple>/release
# If building via cargo, the binaries will be in extractor/target/release
BIN_DIR=extractor/target/release
if [[ "$CARGO" == "cross" ]]; then
BIN_DIR=extractor/target/x86_64-unknown-linux-gnu/release
fi
"$BIN_DIR/generator" --dbscheme ql/lib/ruby.dbscheme --library ql/lib/codeql/ruby/ast/internal/TreeSitter.qll
codeql query format -i ql/lib/codeql/ruby/ast/internal/TreeSitter.qll
@@ -22,5 +30,5 @@ rm -rf extractor-pack
mkdir -p extractor-pack
cp -r codeql-extractor.yml downgrades tools ql/lib/ruby.dbscheme ql/lib/ruby.dbscheme.stats extractor-pack/
mkdir -p extractor-pack/tools/${platform}
cp extractor/target/release/extractor extractor-pack/tools/${platform}/extractor
cp extractor/target/release/autobuilder extractor-pack/tools/${platform}/autobuilder
cp "$BIN_DIR/extractor" extractor-pack/tools/${platform}/extractor
cp "$BIN_DIR/autobuilder" extractor-pack/tools/${platform}/autobuilder