Merge branch 'main' into experimental-strong-params

This commit is contained in:
thiggy1342
2022-07-22 20:41:33 -04:00
committed by GitHub
28 changed files with 1298 additions and 814 deletions

View File

@@ -0,0 +1,4 @@
---
category: fix
---
* Under certain circumstances a variable declaration that is not also a definition could be associated with a `Variable` that did not have the definition as a `VariableDeclarationEntry`. This is now fixed, and a unique `Variable` will exist that has both the declaration and the definition as a `VariableDeclarationEntry`.

View File

@@ -6,6 +6,7 @@
import semmle.code.cpp.Location
private import semmle.code.cpp.Enclosing
private import semmle.code.cpp.internal.ResolveClass
private import semmle.code.cpp.internal.ResolveGlobalVariable
/**
* Get the `Element` that represents this `@element`.
@@ -28,9 +29,12 @@ Element mkElement(@element e) { unresolveElement(result) = e }
pragma[inline]
@element unresolveElement(Element e) {
not result instanceof @usertype and
not result instanceof @variable and
result = e
or
e = resolveClass(result)
or
e = resolveGlobalVariable(result)
}
/**

View File

@@ -6,6 +6,7 @@ import semmle.code.cpp.Element
import semmle.code.cpp.exprs.Access
import semmle.code.cpp.Initializer
private import semmle.code.cpp.internal.ResolveClass
private import semmle.code.cpp.internal.ResolveGlobalVariable
/**
* A C/C++ variable. For example, in the following code there are four
@@ -32,6 +33,8 @@ private import semmle.code.cpp.internal.ResolveClass
* can have multiple declarations.
*/
class Variable extends Declaration, @variable {
Variable() { isVariable(underlyingElement(this)) }
override string getAPrimaryQlClass() { result = "Variable" }
/** Gets the initializer of this variable, if any. */

View File

@@ -0,0 +1,60 @@
private predicate hasDefinition(@globalvariable g) {
exists(@var_decl vd | var_decls(vd, g, _, _, _) | var_def(vd))
}
pragma[noinline]
private predicate onlyOneCompleteGlobalVariableExistsWithMangledName(@mangledname name) {
strictcount(@globalvariable g | hasDefinition(g) and mangled_name(g, name)) = 1
}
/** Holds if `g` is a unique global variable with a definition named `name`. */
pragma[noinline]
private predicate isGlobalWithMangledNameAndWithDefinition(@mangledname name, @globalvariable g) {
hasDefinition(g) and
mangled_name(g, name) and
onlyOneCompleteGlobalVariableExistsWithMangledName(name)
}
/** Holds if `g` is a global variable without a definition named `name`. */
pragma[noinline]
private predicate isGlobalWithMangledNameAndWithoutDefinition(@mangledname name, @globalvariable g) {
not hasDefinition(g) and
mangled_name(g, name)
}
/**
* Holds if `incomplete` is a global variable without a definition, and there exists
* a unique global variable `complete` with the same name that does have a definition.
*/
private predicate hasTwinWithDefinition(@globalvariable incomplete, @globalvariable complete) {
exists(@mangledname name |
not variable_instantiation(incomplete, complete) and
isGlobalWithMangledNameAndWithoutDefinition(name, incomplete) and
isGlobalWithMangledNameAndWithDefinition(name, complete)
)
}
import Cached
cached
private module Cached {
/**
* If `v` is a global variable without a definition, and there exists a unique
* global variable with the same name that does have a definition, then the
* result is that unique global variable. Otherwise, the result is `v`.
*/
cached
@variable resolveGlobalVariable(@variable v) {
hasTwinWithDefinition(v, result)
or
not hasTwinWithDefinition(v, _) and
result = v
}
cached
predicate isVariable(@variable v) {
not v instanceof @globalvariable
or
v = resolveGlobalVariable(_)
}
}

View File

@@ -74,13 +74,12 @@ class ReturnStackAllocatedMemoryConfig extends MustFlowConfiguration {
from
MustFlowPathNode source, MustFlowPathNode sink, VariableAddressInstruction var,
ReturnStackAllocatedMemoryConfig conf, Function f
ReturnStackAllocatedMemoryConfig conf
where
conf.hasFlowPath(source, sink) and
conf.hasFlowPath(pragma[only_bind_into](source), pragma[only_bind_into](sink)) and
source.getNode().asInstruction() = var and
// Only raise an alert if we're returning from the _same_ callable as the on that
// declared the stack variable.
var.getEnclosingFunction() = pragma[only_bind_into](f) and
sink.getNode().getEnclosingCallable() = pragma[only_bind_into](f)
var.getEnclosingFunction() = sink.getNode().getEnclosingCallable()
select sink.getNode(), source, sink, "May return stack-allocated memory from $@.", var.getAst(),
var.getAst().toString()

View File

@@ -77,7 +77,7 @@ class ExecState extends DataFlow::FlowState {
ExecState() {
this =
"ExecState (" + fst.getLocation() + " | " + fst + ", " + snd.getLocation() + " | " + snd + ")" and
interestingConcatenation(fst, snd)
interestingConcatenation(pragma[only_bind_into](fst), pragma[only_bind_into](snd))
}
DataFlow::Node getFstNode() { result = fst }

View File

@@ -4,11 +4,7 @@
| c.c:6:5:6:6 | ls | array of 4 {int} | 1 |
| c.c:8:5:8:7 | iss | array of 4 {array of 2 {int}} | 1 |
| c.c:12:11:12:11 | i | typedef {int} as "int_alias" | 1 |
| c.h:4:12:4:13 | ks | array of {int} | 1 |
| c.h:8:12:8:14 | iss | array of {array of 2 {int}} | 1 |
| c.h:10:12:10:12 | i | int | 1 |
| d.cpp:3:7:3:8 | xs | array of {int} | 1 |
| d.h:3:14:3:15 | xs | array of 2 {int} | 1 |
| file://:0:0:0:0 | (unnamed parameter 0) | reference to {const {struct __va_list_tag}} | 1 |
| file://:0:0:0:0 | (unnamed parameter 0) | rvalue reference to {struct __va_list_tag} | 1 |
| file://:0:0:0:0 | fp_offset | unsigned int | 1 |

View File

@@ -2,7 +2,54 @@ function RegisterExtractorPack(id)
local extractor = GetPlatformToolsDirectory() ..
'Semmle.Extraction.CSharp.Driver'
if OperatingSystem == 'windows' then extractor = extractor .. '.exe' end
function DotnetMatcherBuild(compilerName, compilerPath, compilerArguments,
_languageId)
if compilerName ~= 'dotnet' and compilerName ~= 'dotnet.exe' then
return nil
end
-- The dotnet CLI has the following usage instructions:
-- dotnet [sdk-options] [command] [command-options] [arguments]
-- we are interested in dotnet build, which has the following usage instructions:
-- dotnet [options] build [<PROJECT | SOLUTION>...]
-- For now, parse the command line as follows:
-- Everything that starts with `-` (or `/`) will be ignored.
-- The first non-option argument is treated as the command.
-- if that's `build`, we append `/p:UseSharedCompilation=false` to the command line,
-- otherwise we do nothing.
local match = false
local argv = compilerArguments.argv
if OperatingSystem == 'windows' then
-- let's hope that this split matches the escaping rules `dotnet` applies to command line arguments
-- or, at least, that it is close enough
argv =
NativeArgumentsToArgv(compilerArguments.nativeArgumentPointer)
end
for i, arg in ipairs(argv) do
-- dotnet options start with either - or / (both are legal)
local firstCharacter = string.sub(arg, 1, 1)
if not (firstCharacter == '-') and not (firstCharacter == '/') then
Log(1, 'Dotnet subcommand detected: %s', arg)
if arg == 'build' then match = true end
break
end
end
if match then
return {
order = ORDER_REPLACE,
invocation = BuildExtractorInvocation(id, compilerPath,
compilerPath,
compilerArguments, nil, {
'/p:UseSharedCompilation=false'
})
}
end
return nil
end
local windowsMatchers = {
DotnetMatcherBuild,
CreatePatternMatcher({'^dotnet%.exe$'}, MatchCompilerName, extractor, {
prepend = {'--dotnetexec', '--cil'},
order = ORDER_BEFORE
@@ -10,22 +57,21 @@ function RegisterExtractorPack(id)
CreatePatternMatcher({'^csc.*%.exe$'}, MatchCompilerName, extractor, {
prepend = {'--compiler', '"${compiler}"', '--cil'},
order = ORDER_BEFORE
}),
CreatePatternMatcher({'^fakes.*%.exe$', 'moles.*%.exe'},
MatchCompilerName, nil, {trace = false})
}
local posixMatchers = {
CreatePatternMatcher({'^mcs%.exe$', '^csc%.exe$'}, MatchCompilerName,
extractor, {
prepend = {'--compiler', '"${compiler}"', '--cil'},
order = ORDER_BEFORE
}),
DotnetMatcherBuild,
CreatePatternMatcher({'^mono', '^dotnet$'}, MatchCompilerName,
extractor, {
prepend = {'--dotnetexec', '--cil'},
order = ORDER_BEFORE
}),
CreatePatternMatcher({'^mcs%.exe$', '^csc%.exe$'}, MatchCompilerName,
extractor, {
prepend = {'--compiler', '"${compiler}"', '--cil'},
order = ORDER_BEFORE
}), function(compilerName, compilerPath, compilerArguments, _languageId)
if MatchCompilerName('^msbuild$', compilerName, compilerPath,
compilerArguments) or
@@ -49,7 +95,6 @@ function RegisterExtractorPack(id)
else
return posixMatchers
end
end
-- Return a list of minimum supported versions of the configuration file format

View File

@@ -1,161 +1,112 @@
use crate::trap;
use node_types::{EntryKind, Field, NodeTypeMap, Storage, TypeName};
use std::borrow::Cow;
use std::collections::BTreeMap as Map;
use std::collections::BTreeSet as Set;
use std::fmt;
use std::io::Write;
use std::path::Path;
use tracing::{error, info, span, Level};
use tree_sitter::{Language, Node, Parser, Range, Tree};
pub struct TrapWriter {
/// The accumulated trap entries
trap_output: Vec<TrapEntry>,
/// A counter for generating fresh labels
counter: u32,
/// cache of global keys
global_keys: std::collections::HashMap<String, Label>,
pub fn populate_file(writer: &mut trap::Writer, absolute_path: &Path) -> trap::Label {
let (file_label, fresh) =
writer.global_id(&trap::full_id_for_file(&normalize_path(absolute_path)));
if fresh {
writer.add_tuple(
"files",
vec![
trap::Arg::Label(file_label),
trap::Arg::String(normalize_path(absolute_path)),
],
);
populate_parent_folders(writer, file_label, absolute_path.parent());
}
file_label
}
pub fn new_trap_writer() -> TrapWriter {
TrapWriter {
counter: 0,
trap_output: Vec::new(),
global_keys: std::collections::HashMap::new(),
fn populate_empty_file(writer: &mut trap::Writer) -> trap::Label {
let (file_label, fresh) = writer.global_id("empty;sourcefile");
if fresh {
writer.add_tuple(
"files",
vec![
trap::Arg::Label(file_label),
trap::Arg::String("".to_string()),
],
);
}
file_label
}
impl TrapWriter {
/// Gets a label that will hold the unique ID of the passed string at import time.
/// This can be used for incrementally importable TRAP files -- use globally unique
/// strings to compute a unique ID for table tuples.
///
/// Note: You probably want to make sure that the key strings that you use are disjoint
/// for disjoint column types; the standard way of doing this is to prefix (or append)
/// the column type name to the ID. Thus, you might identify methods in Java by the
/// full ID "methods_com.method.package.DeclaringClass.method(argumentList)".
pub fn populate_empty_location(writer: &mut trap::Writer) {
let file_label = populate_empty_file(writer);
location(writer, file_label, 0, 0, 0, 0);
}
fn fresh_id(&mut self) -> Label {
let label = Label(self.counter);
self.counter += 1;
self.trap_output.push(TrapEntry::FreshId(label));
label
}
fn global_id(&mut self, key: &str) -> (Label, bool) {
if let Some(label) = self.global_keys.get(key) {
return (*label, false);
}
let label = Label(self.counter);
self.counter += 1;
self.global_keys.insert(key.to_owned(), label);
self.trap_output
.push(TrapEntry::MapLabelToKey(label, key.to_owned()));
(label, true)
}
fn add_tuple(&mut self, table_name: &str, args: Vec<Arg>) {
self.trap_output
.push(TrapEntry::GenericTuple(table_name.to_owned(), args))
}
fn populate_file(&mut self, absolute_path: &Path) -> Label {
let (file_label, fresh) = self.global_id(&full_id_for_file(absolute_path));
if fresh {
self.add_tuple(
"files",
vec![
Arg::Label(file_label),
Arg::String(normalize_path(absolute_path)),
],
);
self.populate_parent_folders(file_label, absolute_path.parent());
}
file_label
}
fn populate_empty_file(&mut self) -> Label {
let (file_label, fresh) = self.global_id("empty;sourcefile");
if fresh {
self.add_tuple(
"files",
vec![Arg::Label(file_label), Arg::String("".to_string())],
);
}
file_label
}
pub fn populate_empty_location(&mut self) {
let file_label = self.populate_empty_file();
self.location(file_label, 0, 0, 0, 0);
}
fn populate_parent_folders(&mut self, child_label: Label, path: Option<&Path>) {
let mut path = path;
let mut child_label = child_label;
loop {
match path {
None => break,
Some(folder) => {
let (folder_label, fresh) = self.global_id(&full_id_for_folder(folder));
self.add_tuple(
"containerparent",
vec![Arg::Label(folder_label), Arg::Label(child_label)],
pub fn populate_parent_folders(
writer: &mut trap::Writer,
child_label: trap::Label,
path: Option<&Path>,
) {
let mut path = path;
let mut child_label = child_label;
loop {
match path {
None => break,
Some(folder) => {
let (folder_label, fresh) =
writer.global_id(&trap::full_id_for_folder(&normalize_path(folder)));
writer.add_tuple(
"containerparent",
vec![
trap::Arg::Label(folder_label),
trap::Arg::Label(child_label),
],
);
if fresh {
writer.add_tuple(
"folders",
vec![
trap::Arg::Label(folder_label),
trap::Arg::String(normalize_path(folder)),
],
);
if fresh {
self.add_tuple(
"folders",
vec![
Arg::Label(folder_label),
Arg::String(normalize_path(folder)),
],
);
path = folder.parent();
child_label = folder_label;
} else {
break;
}
path = folder.parent();
child_label = folder_label;
} else {
break;
}
}
}
}
}
fn location(
&mut self,
file_label: Label,
start_line: usize,
start_column: usize,
end_line: usize,
end_column: usize,
) -> Label {
let (loc_label, fresh) = self.global_id(&format!(
"loc,{{{}}},{},{},{},{}",
file_label, start_line, start_column, end_line, end_column
));
if fresh {
self.add_tuple(
"locations_default",
vec![
Arg::Label(loc_label),
Arg::Label(file_label),
Arg::Int(start_line),
Arg::Int(start_column),
Arg::Int(end_line),
Arg::Int(end_column),
],
);
}
loc_label
}
fn comment(&mut self, text: String) {
self.trap_output.push(TrapEntry::Comment(text));
}
pub fn output(self, writer: &mut dyn Write) -> std::io::Result<()> {
write!(writer, "{}", Program(self.trap_output))
fn location(
writer: &mut trap::Writer,
file_label: trap::Label,
start_line: usize,
start_column: usize,
end_line: usize,
end_column: usize,
) -> trap::Label {
let (loc_label, fresh) = writer.global_id(&format!(
"loc,{{{}}},{},{},{},{}",
file_label, start_line, start_column, end_line, end_column
));
if fresh {
writer.add_tuple(
"locations_default",
vec![
trap::Arg::Label(loc_label),
trap::Arg::Label(file_label),
trap::Arg::Int(start_line),
trap::Arg::Int(start_column),
trap::Arg::Int(end_line),
trap::Arg::Int(end_column),
],
);
}
loc_label
}
/// Extracts the source file at `path`, which is assumed to be canonicalized.
@@ -163,71 +114,43 @@ pub fn extract(
language: Language,
language_prefix: &str,
schema: &NodeTypeMap,
trap_writer: &mut TrapWriter,
trap_writer: &mut trap::Writer,
path: &Path,
source: &[u8],
ranges: &[Range],
) -> std::io::Result<()> {
let path_str = format!("{}", path.display());
let span = span!(
Level::TRACE,
"extract",
file = %path.display()
file = %path_str
);
let _enter = span.enter();
info!("extracting: {}", path.display());
info!("extracting: {}", path_str);
let mut parser = Parser::new();
parser.set_language(language).unwrap();
parser.set_included_ranges(ranges).unwrap();
let tree = parser.parse(&source, None).expect("Failed to parse file");
trap_writer.comment(format!("Auto-generated TRAP file for {}", path.display()));
let file_label = &trap_writer.populate_file(path);
let mut visitor = Visitor {
trap_writer.comment(format!("Auto-generated TRAP file for {}", path_str));
let file_label = populate_file(trap_writer, path);
let mut visitor = Visitor::new(
source,
trap_writer,
// TODO: should we handle path strings that are not valid UTF8 better?
path: format!("{}", path.display()),
file_label: *file_label,
toplevel_child_counter: 0,
stack: Vec::new(),
&path_str,
file_label,
language_prefix,
schema,
};
);
traverse(&tree, &mut visitor);
parser.reset();
Ok(())
}
/// Escapes a string for use in a TRAP key, by replacing special characters with
/// HTML entities.
fn escape_key<'a, S: Into<Cow<'a, str>>>(key: S) -> Cow<'a, str> {
fn needs_escaping(c: char) -> bool {
matches!(c, '&' | '{' | '}' | '"' | '@' | '#')
}
let key = key.into();
if key.contains(needs_escaping) {
let mut escaped = String::with_capacity(2 * key.len());
for c in key.chars() {
match c {
'&' => escaped.push_str("&amp;"),
'{' => escaped.push_str("&lbrace;"),
'}' => escaped.push_str("&rbrace;"),
'"' => escaped.push_str("&quot;"),
'@' => escaped.push_str("&commat;"),
'#' => escaped.push_str("&num;"),
_ => escaped.push(c),
}
}
Cow::Owned(escaped)
} else {
key
}
}
/// Normalizes the path according the common CodeQL specification. Assumes that
/// `path` has already been canonicalized using `std::fs::canonicalize`.
fn normalize_path(path: &Path) -> String {
@@ -267,34 +190,28 @@ fn normalize_path(path: &Path) -> String {
}
}
fn full_id_for_file(path: &Path) -> String {
format!("{};sourcefile", escape_key(&normalize_path(path)))
}
fn full_id_for_folder(path: &Path) -> String {
format!("{};folder", escape_key(&normalize_path(path)))
}
struct ChildNode {
field_name: Option<&'static str>,
label: Label,
label: trap::Label,
type_name: TypeName,
}
struct Visitor<'a> {
/// The file path of the source code (as string)
path: String,
path: &'a str,
/// The label to use whenever we need to refer to the `@file` entity of this
/// source file.
file_label: Label,
file_label: trap::Label,
/// The source code as a UTF-8 byte array
source: &'a [u8],
/// A TrapWriter to accumulate trap entries
trap_writer: &'a mut TrapWriter,
/// A trap::Writer to accumulate trap entries
trap_writer: &'a mut trap::Writer,
/// A counter for top-level child nodes
toplevel_child_counter: usize,
/// Language prefix
language_prefix: &'a str,
/// Language-specific name of the AST info table
ast_node_info_table_name: String,
/// Language-specific name of the tokeninfo table
tokeninfo_table_name: String,
/// A lookup table from type name to node types
schema: &'a NodeTypeMap,
/// A stack for gathering information from child nodes. Whenever a node is
@@ -303,27 +220,48 @@ struct Visitor<'a> {
/// node the list containing the child data is popped from the stack and
/// matched against the dbscheme for the node. If the expectations are met
/// the corresponding row definitions are added to the trap_output.
stack: Vec<(Label, usize, Vec<ChildNode>)>,
stack: Vec<(trap::Label, usize, Vec<ChildNode>)>,
}
impl Visitor<'_> {
impl<'a> Visitor<'a> {
fn new(
source: &'a [u8],
trap_writer: &'a mut trap::Writer,
path: &'a str,
file_label: trap::Label,
language_prefix: &str,
schema: &'a NodeTypeMap,
) -> Visitor<'a> {
Visitor {
path,
file_label,
source,
trap_writer,
toplevel_child_counter: 0,
ast_node_info_table_name: format!("{}_ast_node_info", language_prefix),
tokeninfo_table_name: format!("{}_tokeninfo", language_prefix),
schema,
stack: Vec::new(),
}
}
fn record_parse_error(
&mut self,
error_message: String,
full_error_message: String,
loc: Label,
loc: trap::Label,
) {
error!("{}", full_error_message);
let id = self.trap_writer.fresh_id();
self.trap_writer.add_tuple(
"diagnostics",
vec![
Arg::Label(id),
Arg::Int(40), // severity 40 = error
Arg::String("parse_error".to_string()),
Arg::String(error_message),
Arg::String(full_error_message),
Arg::Label(loc),
trap::Arg::Label(id),
trap::Arg::Int(40), // severity 40 = error
trap::Arg::String("parse_error".to_string()),
trap::Arg::String(error_message),
trap::Arg::String(full_error_message),
trap::Arg::Label(loc),
],
);
}
@@ -335,7 +273,8 @@ impl Visitor<'_> {
node: Node,
) {
let (start_line, start_column, end_line, end_column) = location_for(self.source, node);
let loc = self.trap_writer.location(
let loc = location(
self.trap_writer,
self.file_label,
start_line,
start_column,
@@ -374,7 +313,8 @@ impl Visitor<'_> {
}
let (id, _, child_nodes) = self.stack.pop().expect("Vistor: empty stack");
let (start_line, start_column, end_line, end_column) = location_for(self.source, node);
let loc = self.trap_writer.location(
let loc = location(
self.trap_writer,
self.file_label,
start_line,
start_column,
@@ -402,19 +342,19 @@ impl Visitor<'_> {
match &table.kind {
EntryKind::Token { kind_id, .. } => {
self.trap_writer.add_tuple(
&format!("{}_ast_node_info", self.language_prefix),
&self.ast_node_info_table_name,
vec![
Arg::Label(id),
Arg::Label(parent_id),
Arg::Int(parent_index),
Arg::Label(loc),
trap::Arg::Label(id),
trap::Arg::Label(parent_id),
trap::Arg::Int(parent_index),
trap::Arg::Label(loc),
],
);
self.trap_writer.add_tuple(
&format!("{}_tokeninfo", self.language_prefix),
&self.tokeninfo_table_name,
vec![
Arg::Label(id),
Arg::Int(*kind_id),
trap::Arg::Label(id),
trap::Arg::Int(*kind_id),
sliced_source_arg(self.source, node),
],
);
@@ -425,15 +365,15 @@ impl Visitor<'_> {
} => {
if let Some(args) = self.complex_node(&node, fields, &child_nodes, id) {
self.trap_writer.add_tuple(
&format!("{}_ast_node_info", self.language_prefix),
&self.ast_node_info_table_name,
vec![
Arg::Label(id),
Arg::Label(parent_id),
Arg::Int(parent_index),
Arg::Label(loc),
trap::Arg::Label(id),
trap::Arg::Label(parent_id),
trap::Arg::Int(parent_index),
trap::Arg::Label(loc),
],
);
let mut all_args = vec![Arg::Label(id)];
let mut all_args = vec![trap::Arg::Label(id)];
all_args.extend(args);
self.trap_writer.add_tuple(table_name, all_args);
}
@@ -472,9 +412,9 @@ impl Visitor<'_> {
node: &Node,
fields: &[Field],
child_nodes: &[ChildNode],
parent_id: Label,
) -> Option<Vec<Arg>> {
let mut map: Map<&Option<String>, (&Field, Vec<Arg>)> = Map::new();
parent_id: trap::Label,
) -> Option<Vec<trap::Arg>> {
let mut map: Map<&Option<String>, (&Field, Vec<trap::Arg>)> = Map::new();
for field in fields {
map.insert(&field.name, (field, Vec::new()));
}
@@ -488,9 +428,9 @@ impl Visitor<'_> {
{
// We can safely unwrap because type_matches checks the key is in the map.
let (int_value, _) = int_mapping.get(&child_node.type_name.kind).unwrap();
values.push(Arg::Int(*int_value));
values.push(trap::Arg::Int(*int_value));
} else {
values.push(Arg::Label(child_node.label));
values.push(trap::Arg::Label(child_node.label));
}
} else if field.name.is_some() {
let error_message = format!(
@@ -569,9 +509,9 @@ impl Visitor<'_> {
);
break;
}
let mut args = vec![Arg::Label(parent_id)];
let mut args = vec![trap::Arg::Label(parent_id)];
if *has_index {
args.push(Arg::Int(index))
args.push(trap::Arg::Int(index))
}
args.push(child_value.clone());
self.trap_writer.add_tuple(table_name, args);
@@ -625,9 +565,9 @@ impl Visitor<'_> {
}
// Emit a slice of a source file as an Arg.
fn sliced_source_arg(source: &[u8], n: Node) -> Arg {
fn sliced_source_arg(source: &[u8], n: Node) -> trap::Arg {
let range = n.byte_range();
Arg::String(String::from_utf8_lossy(&source[range.start..range.end]).into_owned())
trap::Arg::String(String::from_utf8_lossy(&source[range.start..range.end]).into_owned())
}
// Emit a pair of `TrapEntry`s for the provided node, appropriately calibrated.
@@ -699,59 +639,6 @@ fn traverse(tree: &Tree, visitor: &mut Visitor) {
}
}
pub struct Program(Vec<TrapEntry>);
impl fmt::Display for Program {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
let mut text = String::new();
for trap_entry in &self.0 {
text.push_str(&format!("{}\n", trap_entry));
}
write!(f, "{}", text)
}
}
enum TrapEntry {
/// Maps the label to a fresh id, e.g. `#123=*`.
FreshId(Label),
/// Maps the label to a key, e.g. `#7=@"foo"`.
MapLabelToKey(Label, String),
/// foo_bar(arg*)
GenericTuple(String, Vec<Arg>),
Comment(String),
}
impl fmt::Display for TrapEntry {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
TrapEntry::FreshId(label) => write!(f, "{}=*", label),
TrapEntry::MapLabelToKey(label, key) => {
write!(f, "{}=@\"{}\"", label, key.replace("\"", "\"\""))
}
TrapEntry::GenericTuple(name, args) => {
write!(f, "{}(", name)?;
for (index, arg) in args.iter().enumerate() {
if index > 0 {
write!(f, ",")?;
}
write!(f, "{}", arg)?;
}
write!(f, ")")
}
TrapEntry::Comment(line) => write!(f, "// {}", line),
}
}
}
#[derive(Debug, Copy, Clone)]
// Identifiers of the form #0, #1...
struct Label(u32);
impl fmt::Display for Label {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "#{:x}", self.0)
}
}
// Numeric indices.
#[derive(Debug, Copy, Clone)]
struct Index(usize);
@@ -761,69 +648,3 @@ impl fmt::Display for Index {
write!(f, "{}", self.0)
}
}
// Some untyped argument to a TrapEntry.
#[derive(Debug, Clone)]
enum Arg {
Label(Label),
Int(usize),
String(String),
}
const MAX_STRLEN: usize = 1048576;
impl fmt::Display for Arg {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
Arg::Label(x) => write!(f, "{}", x),
Arg::Int(x) => write!(f, "{}", x),
Arg::String(x) => write!(
f,
"\"{}\"",
limit_string(x, MAX_STRLEN).replace("\"", "\"\"")
),
}
}
}
/// Limit the length (in bytes) of a string. If the string's length in bytes is
/// less than or equal to the limit then the entire string is returned. Otherwise
/// the string is sliced at the provided limit. If there is a multi-byte character
/// at the limit then the returned slice will be slightly shorter than the limit to
/// avoid splitting that multi-byte character.
fn limit_string(string: &str, max_size: usize) -> &str {
if string.len() <= max_size {
return string;
}
let p = string.as_bytes();
let mut index = max_size;
// We want to clip the string at [max_size]; however, the character at that position
// may span several bytes. We need to find the first byte of the character. In UTF-8
// encoded data any byte that matches the bit pattern 10XXXXXX is not a start byte.
// Therefore we decrement the index as long as there are bytes matching this pattern.
// This ensures we cut the string at the border between one character and another.
while index > 0 && (p[index] & 0b11000000) == 0b10000000 {
index -= 1;
}
&string[0..index]
}
#[test]
fn limit_string_test() {
assert_eq!("hello", limit_string(&"hello world".to_owned(), 5));
assert_eq!("hi ☹", limit_string(&"hi ☹☹".to_owned(), 6));
assert_eq!("hi ", limit_string(&"hi ☹☹".to_owned(), 5));
}
#[test]
fn escape_key_test() {
assert_eq!("foo!", escape_key("foo!"));
assert_eq!("foo&lbrace;&rbrace;", escape_key("foo{}"));
assert_eq!("&lbrace;&rbrace;", escape_key("{}"));
assert_eq!("", escape_key(""));
assert_eq!("/path/to/foo.rb", escape_key("/path/to/foo.rb"));
assert_eq!(
"/path/to/foo&amp;&lbrace;&rbrace;&quot;&commat;&num;.rb",
escape_key("/path/to/foo&{}\"@#.rb")
);
}

View File

@@ -1,49 +1,13 @@
mod extractor;
mod trap;
extern crate num_cpus;
use flate2::write::GzEncoder;
use rayon::prelude::*;
use std::fs;
use std::io::{BufRead, BufWriter};
use std::io::BufRead;
use std::path::{Path, PathBuf};
enum TrapCompression {
None,
Gzip,
}
impl TrapCompression {
fn from_env() -> TrapCompression {
match std::env::var("CODEQL_QL_TRAP_COMPRESSION") {
Ok(method) => match TrapCompression::from_string(&method) {
Some(c) => c,
None => {
tracing::error!("Unknown compression method '{}'; using gzip.", &method);
TrapCompression::Gzip
}
},
// Default compression method if the env var isn't set:
Err(_) => TrapCompression::Gzip,
}
}
fn from_string(s: &str) -> Option<TrapCompression> {
match s.to_lowercase().as_ref() {
"none" => Some(TrapCompression::None),
"gzip" => Some(TrapCompression::Gzip),
_ => None,
}
}
fn extension(&self) -> &str {
match self {
TrapCompression::None => "trap",
TrapCompression::Gzip => "trap.gz",
}
}
}
/**
* Gets the number of threads the extractor should use, by reading the
* CODEQL_THREADS environment variable and using it as described in the
@@ -115,7 +79,7 @@ fn main() -> std::io::Result<()> {
.value_of("output-dir")
.expect("missing --output-dir");
let trap_dir = PathBuf::from(trap_dir);
let trap_compression = TrapCompression::from_env();
let trap_compression = trap::Compression::from_env("CODEQL_QL_TRAP_COMPRESSION");
let file_list = matches.value_of("file-list").expect("missing --file-list");
let file_list = fs::File::open(file_list)?;
@@ -140,7 +104,7 @@ fn main() -> std::io::Result<()> {
let src_archive_file = path_for(&src_archive_dir, &path, "");
let source = std::fs::read(&path)?;
let code_ranges = vec![];
let mut trap_writer = extractor::new_trap_writer();
let mut trap_writer = trap::Writer::new();
extractor::extract(
language,
"ql",
@@ -152,33 +116,25 @@ fn main() -> std::io::Result<()> {
)?;
std::fs::create_dir_all(&src_archive_file.parent().unwrap())?;
std::fs::copy(&path, &src_archive_file)?;
write_trap(&trap_dir, path, trap_writer, &trap_compression)
write_trap(&trap_dir, path, &trap_writer, trap_compression)
})
.expect("failed to extract files");
let path = PathBuf::from("extras");
let mut trap_writer = extractor::new_trap_writer();
trap_writer.populate_empty_location();
write_trap(&trap_dir, path, trap_writer, &trap_compression)
let mut trap_writer = trap::Writer::new();
extractor::populate_empty_location(&mut trap_writer);
write_trap(&trap_dir, path, &trap_writer, trap_compression)
}
fn write_trap(
trap_dir: &Path,
path: PathBuf,
trap_writer: extractor::TrapWriter,
trap_compression: &TrapCompression,
trap_writer: &trap::Writer,
trap_compression: trap::Compression,
) -> std::io::Result<()> {
let trap_file = path_for(trap_dir, &path, trap_compression.extension());
std::fs::create_dir_all(&trap_file.parent().unwrap())?;
let trap_file = std::fs::File::create(&trap_file)?;
let mut trap_file = BufWriter::new(trap_file);
match trap_compression {
TrapCompression::None => trap_writer.output(&mut trap_file),
TrapCompression::Gzip => {
let mut compressed_writer = GzEncoder::new(trap_file, flate2::Compression::fast());
trap_writer.output(&mut compressed_writer)
}
}
trap_writer.write_to_file(&trap_file, trap_compression)
}
fn path_for(dir: &Path, path: &Path, ext: &str) -> PathBuf {

272
ql/extractor/src/trap.rs Normal file
View File

@@ -0,0 +1,272 @@
use std::borrow::Cow;
use std::fmt;
use std::io::{BufWriter, Write};
use std::path::Path;
use flate2::write::GzEncoder;
pub struct Writer {
/// The accumulated trap entries
trap_output: Vec<Entry>,
/// A counter for generating fresh labels
counter: u32,
/// cache of global keys
global_keys: std::collections::HashMap<String, Label>,
}
impl Writer {
pub fn new() -> Writer {
Writer {
counter: 0,
trap_output: Vec::new(),
global_keys: std::collections::HashMap::new(),
}
}
pub fn fresh_id(&mut self) -> Label {
let label = Label(self.counter);
self.counter += 1;
self.trap_output.push(Entry::FreshId(label));
label
}
/// Gets a label that will hold the unique ID of the passed string at import time.
/// This can be used for incrementally importable TRAP files -- use globally unique
/// strings to compute a unique ID for table tuples.
///
/// Note: You probably want to make sure that the key strings that you use are disjoint
/// for disjoint column types; the standard way of doing this is to prefix (or append)
/// the column type name to the ID. Thus, you might identify methods in Java by the
/// full ID "methods_com.method.package.DeclaringClass.method(argumentList)".
pub fn global_id(&mut self, key: &str) -> (Label, bool) {
if let Some(label) = self.global_keys.get(key) {
return (*label, false);
}
let label = Label(self.counter);
self.counter += 1;
self.global_keys.insert(key.to_owned(), label);
self.trap_output
.push(Entry::MapLabelToKey(label, key.to_owned()));
(label, true)
}
pub fn add_tuple(&mut self, table_name: &str, args: Vec<Arg>) {
self.trap_output
.push(Entry::GenericTuple(table_name.to_owned(), args))
}
pub fn comment(&mut self, text: String) {
self.trap_output.push(Entry::Comment(text));
}
pub fn write_to_file(&self, path: &Path, compression: Compression) -> std::io::Result<()> {
let trap_file = std::fs::File::create(path)?;
let mut trap_file = BufWriter::new(trap_file);
match compression {
Compression::None => self.write_trap_entries(&mut trap_file),
Compression::Gzip => {
let mut compressed_writer = GzEncoder::new(trap_file, flate2::Compression::fast());
self.write_trap_entries(&mut compressed_writer)
}
}
}
fn write_trap_entries<W: Write>(&self, file: &mut W) -> std::io::Result<()> {
for trap_entry in &self.trap_output {
writeln!(file, "{}", trap_entry)?;
}
std::io::Result::Ok(())
}
}
pub enum Entry {
/// Maps the label to a fresh id, e.g. `#123=*`.
FreshId(Label),
/// Maps the label to a key, e.g. `#7=@"foo"`.
MapLabelToKey(Label, String),
/// foo_bar(arg*)
GenericTuple(String, Vec<Arg>),
Comment(String),
}
impl fmt::Display for Entry {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
Entry::FreshId(label) => write!(f, "{}=*", label),
Entry::MapLabelToKey(label, key) => {
write!(f, "{}=@\"{}\"", label, key.replace("\"", "\"\""))
}
Entry::GenericTuple(name, args) => {
write!(f, "{}(", name)?;
for (index, arg) in args.iter().enumerate() {
if index > 0 {
write!(f, ",")?;
}
write!(f, "{}", arg)?;
}
write!(f, ")")
}
Entry::Comment(line) => write!(f, "// {}", line),
}
}
}
#[derive(Debug, Copy, Clone)]
// Identifiers of the form #0, #1...
pub struct Label(u32);
impl fmt::Display for Label {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "#{:x}", self.0)
}
}
// Some untyped argument to a TrapEntry.
#[derive(Debug, Clone)]
pub enum Arg {
Label(Label),
Int(usize),
String(String),
}
const MAX_STRLEN: usize = 1048576;
impl fmt::Display for Arg {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
Arg::Label(x) => write!(f, "{}", x),
Arg::Int(x) => write!(f, "{}", x),
Arg::String(x) => write!(
f,
"\"{}\"",
limit_string(x, MAX_STRLEN).replace("\"", "\"\"")
),
}
}
}
pub struct Program(Vec<Entry>);
impl fmt::Display for Program {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
let mut text = String::new();
for trap_entry in &self.0 {
text.push_str(&format!("{}\n", trap_entry));
}
write!(f, "{}", text)
}
}
pub fn full_id_for_file(normalized_path: &str) -> String {
format!("{};sourcefile", escape_key(normalized_path))
}
pub fn full_id_for_folder(normalized_path: &str) -> String {
format!("{};folder", escape_key(normalized_path))
}
/// Escapes a string for use in a TRAP key, by replacing special characters with
/// HTML entities.
fn escape_key<'a, S: Into<Cow<'a, str>>>(key: S) -> Cow<'a, str> {
fn needs_escaping(c: char) -> bool {
matches!(c, '&' | '{' | '}' | '"' | '@' | '#')
}
let key = key.into();
if key.contains(needs_escaping) {
let mut escaped = String::with_capacity(2 * key.len());
for c in key.chars() {
match c {
'&' => escaped.push_str("&amp;"),
'{' => escaped.push_str("&lbrace;"),
'}' => escaped.push_str("&rbrace;"),
'"' => escaped.push_str("&quot;"),
'@' => escaped.push_str("&commat;"),
'#' => escaped.push_str("&num;"),
_ => escaped.push(c),
}
}
Cow::Owned(escaped)
} else {
key
}
}
/// Limit the length (in bytes) of a string. If the string's length in bytes is
/// less than or equal to the limit then the entire string is returned. Otherwise
/// the string is sliced at the provided limit. If there is a multi-byte character
/// at the limit then the returned slice will be slightly shorter than the limit to
/// avoid splitting that multi-byte character.
fn limit_string(string: &str, max_size: usize) -> &str {
if string.len() <= max_size {
return string;
}
let p = string.as_bytes();
let mut index = max_size;
// We want to clip the string at [max_size]; however, the character at that position
// may span several bytes. We need to find the first byte of the character. In UTF-8
// encoded data any byte that matches the bit pattern 10XXXXXX is not a start byte.
// Therefore we decrement the index as long as there are bytes matching this pattern.
// This ensures we cut the string at the border between one character and another.
while index > 0 && (p[index] & 0b11000000) == 0b10000000 {
index -= 1;
}
&string[0..index]
}
#[derive(Clone, Copy)]
pub enum Compression {
None,
Gzip,
}
impl Compression {
pub fn from_env(var_name: &str) -> Compression {
match std::env::var(var_name) {
Ok(method) => match Compression::from_string(&method) {
Some(c) => c,
None => {
tracing::error!("Unknown compression method '{}'; using gzip.", &method);
Compression::Gzip
}
},
// Default compression method if the env var isn't set:
Err(_) => Compression::Gzip,
}
}
pub fn from_string(s: &str) -> Option<Compression> {
match s.to_lowercase().as_ref() {
"none" => Some(Compression::None),
"gzip" => Some(Compression::Gzip),
_ => None,
}
}
pub fn extension(&self) -> &str {
match self {
Compression::None => "trap",
Compression::Gzip => "trap.gz",
}
}
}
#[test]
fn limit_string_test() {
assert_eq!("hello", limit_string(&"hello world".to_owned(), 5));
assert_eq!("hi ☹", limit_string(&"hi ☹☹".to_owned(), 6));
assert_eq!("hi ", limit_string(&"hi ☹☹".to_owned(), 5));
}
#[test]
fn escape_key_test() {
assert_eq!("foo!", escape_key("foo!"));
assert_eq!("foo&lbrace;&rbrace;", escape_key("foo{}"));
assert_eq!("&lbrace;&rbrace;", escape_key("{}"));
assert_eq!("", escape_key(""));
assert_eq!("/path/to/foo.rb", escape_key("/path/to/foo.rb"));
assert_eq!(
"/path/to/foo&amp;&lbrace;&rbrace;&quot;&commat;&num;.rb",
escape_key("/path/to/foo&{}\"@#.rb")
);
}

BIN
ruby/Cargo.lock generated

Binary file not shown.

View File

@@ -11,10 +11,12 @@ flate2 = "1.0"
node-types = { path = "../node-types" }
tree-sitter = "0.19"
tree-sitter-embedded-template = { git = "https://github.com/tree-sitter/tree-sitter-embedded-template.git", rev = "1a538da253d73f896b9f6c0c7d79cda58791ac5c" }
tree-sitter-ruby = { git = "https://github.com/tree-sitter/tree-sitter-ruby.git", rev = "5b305c3cd32db10494cedd2743de6bbe32f1a573" }
tree-sitter-ruby = { git = "https://github.com/tree-sitter/tree-sitter-ruby.git", rev = "e75d04404c9dd71ad68850d5c672b226d5e694f3" }
clap = "3.0"
tracing = "0.1"
tracing-subscriber = { version = "0.3.3", features = ["env-filter"] }
rayon = "1.5.0"
num_cpus = "1.13.0"
regex = "1.5.5"
encoding = "0.2"
lazy_static = "1.4.0"

View File

@@ -1,161 +1,112 @@
use crate::trap;
use node_types::{EntryKind, Field, NodeTypeMap, Storage, TypeName};
use std::borrow::Cow;
use std::collections::BTreeMap as Map;
use std::collections::BTreeSet as Set;
use std::fmt;
use std::io::Write;
use std::path::Path;
use tracing::{error, info, span, Level};
use tree_sitter::{Language, Node, Parser, Range, Tree};
pub struct TrapWriter {
/// The accumulated trap entries
trap_output: Vec<TrapEntry>,
/// A counter for generating fresh labels
counter: u32,
/// cache of global keys
global_keys: std::collections::HashMap<String, Label>,
pub fn populate_file(writer: &mut trap::Writer, absolute_path: &Path) -> trap::Label {
let (file_label, fresh) =
writer.global_id(&trap::full_id_for_file(&normalize_path(absolute_path)));
if fresh {
writer.add_tuple(
"files",
vec![
trap::Arg::Label(file_label),
trap::Arg::String(normalize_path(absolute_path)),
],
);
populate_parent_folders(writer, file_label, absolute_path.parent());
}
file_label
}
pub fn new_trap_writer() -> TrapWriter {
TrapWriter {
counter: 0,
trap_output: Vec::new(),
global_keys: std::collections::HashMap::new(),
fn populate_empty_file(writer: &mut trap::Writer) -> trap::Label {
let (file_label, fresh) = writer.global_id("empty;sourcefile");
if fresh {
writer.add_tuple(
"files",
vec![
trap::Arg::Label(file_label),
trap::Arg::String("".to_string()),
],
);
}
file_label
}
impl TrapWriter {
/// Gets a label that will hold the unique ID of the passed string at import time.
/// This can be used for incrementally importable TRAP files -- use globally unique
/// strings to compute a unique ID for table tuples.
///
/// Note: You probably want to make sure that the key strings that you use are disjoint
/// for disjoint column types; the standard way of doing this is to prefix (or append)
/// the column type name to the ID. Thus, you might identify methods in Java by the
/// full ID "methods_com.method.package.DeclaringClass.method(argumentList)".
pub fn populate_empty_location(writer: &mut trap::Writer) {
let file_label = populate_empty_file(writer);
location(writer, file_label, 0, 0, 0, 0);
}
fn fresh_id(&mut self) -> Label {
let label = Label(self.counter);
self.counter += 1;
self.trap_output.push(TrapEntry::FreshId(label));
label
}
fn global_id(&mut self, key: &str) -> (Label, bool) {
if let Some(label) = self.global_keys.get(key) {
return (*label, false);
}
let label = Label(self.counter);
self.counter += 1;
self.global_keys.insert(key.to_owned(), label);
self.trap_output
.push(TrapEntry::MapLabelToKey(label, key.to_owned()));
(label, true)
}
fn add_tuple(&mut self, table_name: &str, args: Vec<Arg>) {
self.trap_output
.push(TrapEntry::GenericTuple(table_name.to_owned(), args))
}
fn populate_file(&mut self, absolute_path: &Path) -> Label {
let (file_label, fresh) = self.global_id(&full_id_for_file(absolute_path));
if fresh {
self.add_tuple(
"files",
vec![
Arg::Label(file_label),
Arg::String(normalize_path(absolute_path)),
],
);
self.populate_parent_folders(file_label, absolute_path.parent());
}
file_label
}
fn populate_empty_file(&mut self) -> Label {
let (file_label, fresh) = self.global_id("empty;sourcefile");
if fresh {
self.add_tuple(
"files",
vec![Arg::Label(file_label), Arg::String("".to_string())],
);
}
file_label
}
pub fn populate_empty_location(&mut self) {
let file_label = self.populate_empty_file();
self.location(file_label, 0, 0, 0, 0);
}
fn populate_parent_folders(&mut self, child_label: Label, path: Option<&Path>) {
let mut path = path;
let mut child_label = child_label;
loop {
match path {
None => break,
Some(folder) => {
let (folder_label, fresh) = self.global_id(&full_id_for_folder(folder));
self.add_tuple(
"containerparent",
vec![Arg::Label(folder_label), Arg::Label(child_label)],
pub fn populate_parent_folders(
writer: &mut trap::Writer,
child_label: trap::Label,
path: Option<&Path>,
) {
let mut path = path;
let mut child_label = child_label;
loop {
match path {
None => break,
Some(folder) => {
let (folder_label, fresh) =
writer.global_id(&trap::full_id_for_folder(&normalize_path(folder)));
writer.add_tuple(
"containerparent",
vec![
trap::Arg::Label(folder_label),
trap::Arg::Label(child_label),
],
);
if fresh {
writer.add_tuple(
"folders",
vec![
trap::Arg::Label(folder_label),
trap::Arg::String(normalize_path(folder)),
],
);
if fresh {
self.add_tuple(
"folders",
vec![
Arg::Label(folder_label),
Arg::String(normalize_path(folder)),
],
);
path = folder.parent();
child_label = folder_label;
} else {
break;
}
path = folder.parent();
child_label = folder_label;
} else {
break;
}
}
}
}
}
fn location(
&mut self,
file_label: Label,
start_line: usize,
start_column: usize,
end_line: usize,
end_column: usize,
) -> Label {
let (loc_label, fresh) = self.global_id(&format!(
"loc,{{{}}},{},{},{},{}",
file_label, start_line, start_column, end_line, end_column
));
if fresh {
self.add_tuple(
"locations_default",
vec![
Arg::Label(loc_label),
Arg::Label(file_label),
Arg::Int(start_line),
Arg::Int(start_column),
Arg::Int(end_line),
Arg::Int(end_column),
],
);
}
loc_label
}
fn comment(&mut self, text: String) {
self.trap_output.push(TrapEntry::Comment(text));
}
pub fn output(self, writer: &mut dyn Write) -> std::io::Result<()> {
write!(writer, "{}", Program(self.trap_output))
fn location(
writer: &mut trap::Writer,
file_label: trap::Label,
start_line: usize,
start_column: usize,
end_line: usize,
end_column: usize,
) -> trap::Label {
let (loc_label, fresh) = writer.global_id(&format!(
"loc,{{{}}},{},{},{},{}",
file_label, start_line, start_column, end_line, end_column
));
if fresh {
writer.add_tuple(
"locations_default",
vec![
trap::Arg::Label(loc_label),
trap::Arg::Label(file_label),
trap::Arg::Int(start_line),
trap::Arg::Int(start_column),
trap::Arg::Int(end_line),
trap::Arg::Int(end_column),
],
);
}
loc_label
}
/// Extracts the source file at `path`, which is assumed to be canonicalized.
@@ -163,71 +114,43 @@ pub fn extract(
language: Language,
language_prefix: &str,
schema: &NodeTypeMap,
trap_writer: &mut TrapWriter,
trap_writer: &mut trap::Writer,
path: &Path,
source: &[u8],
ranges: &[Range],
) -> std::io::Result<()> {
let path_str = format!("{}", path.display());
let span = span!(
Level::TRACE,
"extract",
file = %path.display()
file = %path_str
);
let _enter = span.enter();
info!("extracting: {}", path.display());
info!("extracting: {}", path_str);
let mut parser = Parser::new();
parser.set_language(language).unwrap();
parser.set_included_ranges(ranges).unwrap();
let tree = parser.parse(&source, None).expect("Failed to parse file");
trap_writer.comment(format!("Auto-generated TRAP file for {}", path.display()));
let file_label = &trap_writer.populate_file(path);
let mut visitor = Visitor {
trap_writer.comment(format!("Auto-generated TRAP file for {}", path_str));
let file_label = populate_file(trap_writer, path);
let mut visitor = Visitor::new(
source,
trap_writer,
// TODO: should we handle path strings that are not valid UTF8 better?
path: format!("{}", path.display()),
file_label: *file_label,
toplevel_child_counter: 0,
stack: Vec::new(),
&path_str,
file_label,
language_prefix,
schema,
};
);
traverse(&tree, &mut visitor);
parser.reset();
Ok(())
}
/// Escapes a string for use in a TRAP key, by replacing special characters with
/// HTML entities.
fn escape_key<'a, S: Into<Cow<'a, str>>>(key: S) -> Cow<'a, str> {
fn needs_escaping(c: char) -> bool {
matches!(c, '&' | '{' | '}' | '"' | '@' | '#')
}
let key = key.into();
if key.contains(needs_escaping) {
let mut escaped = String::with_capacity(2 * key.len());
for c in key.chars() {
match c {
'&' => escaped.push_str("&amp;"),
'{' => escaped.push_str("&lbrace;"),
'}' => escaped.push_str("&rbrace;"),
'"' => escaped.push_str("&quot;"),
'@' => escaped.push_str("&commat;"),
'#' => escaped.push_str("&num;"),
_ => escaped.push(c),
}
}
Cow::Owned(escaped)
} else {
key
}
}
/// Normalizes the path according the common CodeQL specification. Assumes that
/// `path` has already been canonicalized using `std::fs::canonicalize`.
fn normalize_path(path: &Path) -> String {
@@ -267,34 +190,28 @@ fn normalize_path(path: &Path) -> String {
}
}
fn full_id_for_file(path: &Path) -> String {
format!("{};sourcefile", escape_key(&normalize_path(path)))
}
fn full_id_for_folder(path: &Path) -> String {
format!("{};folder", escape_key(&normalize_path(path)))
}
struct ChildNode {
field_name: Option<&'static str>,
label: Label,
label: trap::Label,
type_name: TypeName,
}
struct Visitor<'a> {
/// The file path of the source code (as string)
path: String,
path: &'a str,
/// The label to use whenever we need to refer to the `@file` entity of this
/// source file.
file_label: Label,
file_label: trap::Label,
/// The source code as a UTF-8 byte array
source: &'a [u8],
/// A TrapWriter to accumulate trap entries
trap_writer: &'a mut TrapWriter,
/// A trap::Writer to accumulate trap entries
trap_writer: &'a mut trap::Writer,
/// A counter for top-level child nodes
toplevel_child_counter: usize,
/// Language prefix
language_prefix: &'a str,
/// Language-specific name of the AST info table
ast_node_info_table_name: String,
/// Language-specific name of the tokeninfo table
tokeninfo_table_name: String,
/// A lookup table from type name to node types
schema: &'a NodeTypeMap,
/// A stack for gathering information from child nodes. Whenever a node is
@@ -303,27 +220,48 @@ struct Visitor<'a> {
/// node the list containing the child data is popped from the stack and
/// matched against the dbscheme for the node. If the expectations are met
/// the corresponding row definitions are added to the trap_output.
stack: Vec<(Label, usize, Vec<ChildNode>)>,
stack: Vec<(trap::Label, usize, Vec<ChildNode>)>,
}
impl Visitor<'_> {
impl<'a> Visitor<'a> {
fn new(
source: &'a [u8],
trap_writer: &'a mut trap::Writer,
path: &'a str,
file_label: trap::Label,
language_prefix: &str,
schema: &'a NodeTypeMap,
) -> Visitor<'a> {
Visitor {
path,
file_label,
source,
trap_writer,
toplevel_child_counter: 0,
ast_node_info_table_name: format!("{}_ast_node_info", language_prefix),
tokeninfo_table_name: format!("{}_tokeninfo", language_prefix),
schema,
stack: Vec::new(),
}
}
fn record_parse_error(
&mut self,
error_message: String,
full_error_message: String,
loc: Label,
loc: trap::Label,
) {
error!("{}", full_error_message);
let id = self.trap_writer.fresh_id();
self.trap_writer.add_tuple(
"diagnostics",
vec![
Arg::Label(id),
Arg::Int(40), // severity 40 = error
Arg::String("parse_error".to_string()),
Arg::String(error_message),
Arg::String(full_error_message),
Arg::Label(loc),
trap::Arg::Label(id),
trap::Arg::Int(40), // severity 40 = error
trap::Arg::String("parse_error".to_string()),
trap::Arg::String(error_message),
trap::Arg::String(full_error_message),
trap::Arg::Label(loc),
],
);
}
@@ -335,7 +273,8 @@ impl Visitor<'_> {
node: Node,
) {
let (start_line, start_column, end_line, end_column) = location_for(self.source, node);
let loc = self.trap_writer.location(
let loc = location(
self.trap_writer,
self.file_label,
start_line,
start_column,
@@ -374,7 +313,8 @@ impl Visitor<'_> {
}
let (id, _, child_nodes) = self.stack.pop().expect("Vistor: empty stack");
let (start_line, start_column, end_line, end_column) = location_for(self.source, node);
let loc = self.trap_writer.location(
let loc = location(
self.trap_writer,
self.file_label,
start_line,
start_column,
@@ -402,19 +342,19 @@ impl Visitor<'_> {
match &table.kind {
EntryKind::Token { kind_id, .. } => {
self.trap_writer.add_tuple(
&format!("{}_ast_node_info", self.language_prefix),
&self.ast_node_info_table_name,
vec![
Arg::Label(id),
Arg::Label(parent_id),
Arg::Int(parent_index),
Arg::Label(loc),
trap::Arg::Label(id),
trap::Arg::Label(parent_id),
trap::Arg::Int(parent_index),
trap::Arg::Label(loc),
],
);
self.trap_writer.add_tuple(
&format!("{}_tokeninfo", self.language_prefix),
&self.tokeninfo_table_name,
vec![
Arg::Label(id),
Arg::Int(*kind_id),
trap::Arg::Label(id),
trap::Arg::Int(*kind_id),
sliced_source_arg(self.source, node),
],
);
@@ -425,15 +365,15 @@ impl Visitor<'_> {
} => {
if let Some(args) = self.complex_node(&node, fields, &child_nodes, id) {
self.trap_writer.add_tuple(
&format!("{}_ast_node_info", self.language_prefix),
&self.ast_node_info_table_name,
vec![
Arg::Label(id),
Arg::Label(parent_id),
Arg::Int(parent_index),
Arg::Label(loc),
trap::Arg::Label(id),
trap::Arg::Label(parent_id),
trap::Arg::Int(parent_index),
trap::Arg::Label(loc),
],
);
let mut all_args = vec![Arg::Label(id)];
let mut all_args = vec![trap::Arg::Label(id)];
all_args.extend(args);
self.trap_writer.add_tuple(table_name, all_args);
}
@@ -472,9 +412,9 @@ impl Visitor<'_> {
node: &Node,
fields: &[Field],
child_nodes: &[ChildNode],
parent_id: Label,
) -> Option<Vec<Arg>> {
let mut map: Map<&Option<String>, (&Field, Vec<Arg>)> = Map::new();
parent_id: trap::Label,
) -> Option<Vec<trap::Arg>> {
let mut map: Map<&Option<String>, (&Field, Vec<trap::Arg>)> = Map::new();
for field in fields {
map.insert(&field.name, (field, Vec::new()));
}
@@ -488,9 +428,9 @@ impl Visitor<'_> {
{
// We can safely unwrap because type_matches checks the key is in the map.
let (int_value, _) = int_mapping.get(&child_node.type_name.kind).unwrap();
values.push(Arg::Int(*int_value));
values.push(trap::Arg::Int(*int_value));
} else {
values.push(Arg::Label(child_node.label));
values.push(trap::Arg::Label(child_node.label));
}
} else if field.name.is_some() {
let error_message = format!(
@@ -569,9 +509,9 @@ impl Visitor<'_> {
);
break;
}
let mut args = vec![Arg::Label(parent_id)];
let mut args = vec![trap::Arg::Label(parent_id)];
if *has_index {
args.push(Arg::Int(index))
args.push(trap::Arg::Int(index))
}
args.push(child_value.clone());
self.trap_writer.add_tuple(table_name, args);
@@ -625,9 +565,9 @@ impl Visitor<'_> {
}
// Emit a slice of a source file as an Arg.
fn sliced_source_arg(source: &[u8], n: Node) -> Arg {
fn sliced_source_arg(source: &[u8], n: Node) -> trap::Arg {
let range = n.byte_range();
Arg::String(String::from_utf8_lossy(&source[range.start..range.end]).into_owned())
trap::Arg::String(String::from_utf8_lossy(&source[range.start..range.end]).into_owned())
}
// Emit a pair of `TrapEntry`s for the provided node, appropriately calibrated.
@@ -699,59 +639,6 @@ fn traverse(tree: &Tree, visitor: &mut Visitor) {
}
}
pub struct Program(Vec<TrapEntry>);
impl fmt::Display for Program {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
let mut text = String::new();
for trap_entry in &self.0 {
text.push_str(&format!("{}\n", trap_entry));
}
write!(f, "{}", text)
}
}
enum TrapEntry {
/// Maps the label to a fresh id, e.g. `#123=*`.
FreshId(Label),
/// Maps the label to a key, e.g. `#7=@"foo"`.
MapLabelToKey(Label, String),
/// foo_bar(arg*)
GenericTuple(String, Vec<Arg>),
Comment(String),
}
impl fmt::Display for TrapEntry {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
TrapEntry::FreshId(label) => write!(f, "{}=*", label),
TrapEntry::MapLabelToKey(label, key) => {
write!(f, "{}=@\"{}\"", label, key.replace("\"", "\"\""))
}
TrapEntry::GenericTuple(name, args) => {
write!(f, "{}(", name)?;
for (index, arg) in args.iter().enumerate() {
if index > 0 {
write!(f, ",")?;
}
write!(f, "{}", arg)?;
}
write!(f, ")")
}
TrapEntry::Comment(line) => write!(f, "// {}", line),
}
}
}
#[derive(Debug, Copy, Clone)]
// Identifiers of the form #0, #1...
struct Label(u32);
impl fmt::Display for Label {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "#{:x}", self.0)
}
}
// Numeric indices.
#[derive(Debug, Copy, Clone)]
struct Index(usize);
@@ -761,69 +648,3 @@ impl fmt::Display for Index {
write!(f, "{}", self.0)
}
}
// Some untyped argument to a TrapEntry.
#[derive(Debug, Clone)]
enum Arg {
Label(Label),
Int(usize),
String(String),
}
const MAX_STRLEN: usize = 1048576;
impl fmt::Display for Arg {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
Arg::Label(x) => write!(f, "{}", x),
Arg::Int(x) => write!(f, "{}", x),
Arg::String(x) => write!(
f,
"\"{}\"",
limit_string(x, MAX_STRLEN).replace("\"", "\"\"")
),
}
}
}
/// Limit the length (in bytes) of a string. If the string's length in bytes is
/// less than or equal to the limit then the entire string is returned. Otherwise
/// the string is sliced at the provided limit. If there is a multi-byte character
/// at the limit then the returned slice will be slightly shorter than the limit to
/// avoid splitting that multi-byte character.
fn limit_string(string: &str, max_size: usize) -> &str {
if string.len() <= max_size {
return string;
}
let p = string.as_bytes();
let mut index = max_size;
// We want to clip the string at [max_size]; however, the character at that position
// may span several bytes. We need to find the first byte of the character. In UTF-8
// encoded data any byte that matches the bit pattern 10XXXXXX is not a start byte.
// Therefore we decrement the index as long as there are bytes matching this pattern.
// This ensures we cut the string at the border between one character and another.
while index > 0 && (p[index] & 0b11000000) == 0b10000000 {
index -= 1;
}
&string[0..index]
}
#[test]
fn limit_string_test() {
assert_eq!("hello", limit_string(&"hello world".to_owned(), 5));
assert_eq!("hi ☹", limit_string(&"hi ☹☹".to_owned(), 6));
assert_eq!("hi ", limit_string(&"hi ☹☹".to_owned(), 5));
}
#[test]
fn escape_key_test() {
assert_eq!("foo!", escape_key("foo!"));
assert_eq!("foo&lbrace;&rbrace;", escape_key("foo{}"));
assert_eq!("&lbrace;&rbrace;", escape_key("{}"));
assert_eq!("", escape_key(""));
assert_eq!("/path/to/foo.rb", escape_key("/path/to/foo.rb"));
assert_eq!(
"/path/to/foo&amp;&lbrace;&rbrace;&quot;&commat;&num;.rb",
escape_key("/path/to/foo&{}\"@#.rb")
);
}

View File

@@ -1,51 +1,19 @@
mod extractor;
mod trap;
#[macro_use]
extern crate lazy_static;
extern crate num_cpus;
use clap::arg;
use flate2::write::GzEncoder;
use encoding::{self};
use rayon::prelude::*;
use std::borrow::Cow;
use std::fs;
use std::io::{BufRead, BufWriter};
use std::io::BufRead;
use std::path::{Path, PathBuf};
use tree_sitter::{Language, Parser, Range};
enum TrapCompression {
None,
Gzip,
}
impl TrapCompression {
fn from_env() -> TrapCompression {
match std::env::var("CODEQL_RUBY_TRAP_COMPRESSION") {
Ok(method) => match TrapCompression::from_string(&method) {
Some(c) => c,
None => {
tracing::error!("Unknown compression method '{}'; using gzip.", &method);
TrapCompression::Gzip
}
},
// Default compression method if the env var isn't set:
Err(_) => TrapCompression::Gzip,
}
}
fn from_string(s: &str) -> Option<TrapCompression> {
match s.to_lowercase().as_ref() {
"none" => Some(TrapCompression::None),
"gzip" => Some(TrapCompression::Gzip),
_ => None,
}
}
fn extension(&self) -> &str {
match self {
TrapCompression::None => "trap",
TrapCompression::Gzip => "trap.gz",
}
}
}
/**
* Gets the number of threads the extractor should use, by reading the
* CODEQL_THREADS environment variable and using it as described in the
@@ -75,6 +43,21 @@ fn num_codeql_threads() -> usize {
}
}
lazy_static! {
static ref CP_NUMBER: regex::Regex = regex::Regex::new("cp([0-9]+)").unwrap();
}
fn encoding_from_name(encoding_name: &str) -> Option<&(dyn encoding::Encoding + Send + Sync)> {
match encoding::label::encoding_from_whatwg_label(encoding_name) {
s @ Some(_) => s,
None => CP_NUMBER.captures(encoding_name).and_then(|cap| {
encoding::label::encoding_from_windows_code_page(
str::parse(cap.get(1).unwrap().as_str()).unwrap(),
)
}),
}
}
fn main() -> std::io::Result<()> {
tracing_subscriber::fmt()
.with_target(false)
@@ -118,7 +101,7 @@ fn main() -> std::io::Result<()> {
.value_of("output-dir")
.expect("missing --output-dir");
let trap_dir = PathBuf::from(trap_dir);
let trap_compression = TrapCompression::from_env();
let trap_compression = trap::Compression::from_env("CODEQL_RUBY_TRAP_COMPRESSION");
let file_list = matches.value_of("file-list").expect("missing --file-list");
let file_list = fs::File::open(file_list)?;
@@ -140,8 +123,9 @@ fn main() -> std::io::Result<()> {
let path = PathBuf::from(line).canonicalize()?;
let src_archive_file = path_for(&src_archive_dir, &path, "");
let mut source = std::fs::read(&path)?;
let mut needs_conversion = false;
let code_ranges;
let mut trap_writer = extractor::new_trap_writer();
let mut trap_writer = trap::Writer::new();
if path.extension().map_or(false, |x| x == "erb") {
tracing::info!("scanning: {}", path.display());
extractor::extract(
@@ -168,6 +152,43 @@ fn main() -> std::io::Result<()> {
}
code_ranges = ranges;
} else {
if let Some(encoding_name) = scan_coding_comment(&source) {
// If the input is already UTF-8 then there is no need to recode the source
// If the declared encoding is 'binary' or 'ascii-8bit' then it is not clear how
// to interpret characters. In this case it is probably best to leave the input
// unchanged.
if !encoding_name.eq_ignore_ascii_case("utf-8")
&& !encoding_name.eq_ignore_ascii_case("ascii-8bit")
&& !encoding_name.eq_ignore_ascii_case("binary")
{
if let Some(encoding) = encoding_from_name(&encoding_name) {
needs_conversion =
encoding.whatwg_name().unwrap_or_default() != "utf-8";
if needs_conversion {
match encoding
.decode(&source, encoding::types::DecoderTrap::Replace)
{
Ok(str) => source = str.as_bytes().to_owned(),
Err(msg) => {
needs_conversion = false;
tracing::warn!(
"{}: character decoding failure: {} ({})",
&path.to_string_lossy(),
msg,
&encoding_name
);
}
}
}
} else {
tracing::warn!(
"{}: unknown character encoding: '{}'",
&path.to_string_lossy(),
&encoding_name
);
}
}
}
code_ranges = vec![];
}
extractor::extract(
@@ -180,34 +201,30 @@ fn main() -> std::io::Result<()> {
&code_ranges,
)?;
std::fs::create_dir_all(&src_archive_file.parent().unwrap())?;
std::fs::copy(&path, &src_archive_file)?;
write_trap(&trap_dir, path, trap_writer, &trap_compression)
if needs_conversion {
std::fs::write(&src_archive_file, &source)?;
} else {
std::fs::copy(&path, &src_archive_file)?;
}
write_trap(&trap_dir, path, &trap_writer, trap_compression)
})
.expect("failed to extract files");
let path = PathBuf::from("extras");
let mut trap_writer = extractor::new_trap_writer();
trap_writer.populate_empty_location();
write_trap(&trap_dir, path, trap_writer, &trap_compression)
let mut trap_writer = trap::Writer::new();
extractor::populate_empty_location(&mut trap_writer);
write_trap(&trap_dir, path, &trap_writer, trap_compression)
}
fn write_trap(
trap_dir: &Path,
path: PathBuf,
trap_writer: extractor::TrapWriter,
trap_compression: &TrapCompression,
trap_writer: &trap::Writer,
trap_compression: trap::Compression,
) -> std::io::Result<()> {
let trap_file = path_for(trap_dir, &path, trap_compression.extension());
std::fs::create_dir_all(&trap_file.parent().unwrap())?;
let trap_file = std::fs::File::create(&trap_file)?;
let mut trap_file = BufWriter::new(trap_file);
match trap_compression {
TrapCompression::None => trap_writer.output(&mut trap_file),
TrapCompression::Gzip => {
let mut compressed_writer = GzEncoder::new(trap_file, flate2::Compression::fast());
trap_writer.output(&mut compressed_writer)
}
}
trap_writer.write_to_file(&trap_file, trap_compression)
}
fn scan_erb(
@@ -299,3 +316,143 @@ fn path_for(dir: &Path, path: &Path, ext: &str) -> PathBuf {
}
result
}
fn skip_space(content: &[u8], index: usize) -> usize {
let mut index = index;
while index < content.len() {
let c = content[index] as char;
// white space except \n
let is_space = c == ' ' || ('\t'..='\r').contains(&c) && c != '\n';
if !is_space {
break;
}
index += 1;
}
index
}
fn scan_coding_comment(content: &[u8]) -> std::option::Option<Cow<str>> {
let mut index = 0;
// skip UTF-8 BOM marker if there is one
if content.len() >= 3 && content[0] == 0xef && content[1] == 0xbb && content[2] == 0xbf {
index += 3;
}
// skip #! line if there is one
if index + 1 < content.len()
&& content[index] as char == '#'
&& content[index + 1] as char == '!'
{
index += 2;
while index < content.len() && content[index] as char != '\n' {
index += 1
}
index += 1
}
index = skip_space(content, index);
if index >= content.len() || content[index] as char != '#' {
return None;
}
index += 1;
const CODING: [char; 12] = ['C', 'c', 'O', 'o', 'D', 'd', 'I', 'i', 'N', 'n', 'G', 'g'];
let mut word_index = 0;
while index < content.len() && word_index < CODING.len() && content[index] as char != '\n' {
if content[index] as char == CODING[word_index]
|| content[index] as char == CODING[word_index + 1]
{
word_index += 2
} else {
word_index = 0;
}
index += 1;
}
if word_index < CODING.len() {
return None;
}
index = skip_space(content, index);
if index < content.len() && content[index] as char != ':' && content[index] as char != '=' {
return None;
}
index += 1;
index = skip_space(content, index);
let start = index;
while index < content.len() {
let c = content[index] as char;
if c == '-' || c == '_' || c.is_ascii_alphanumeric() {
index += 1;
} else {
break;
}
}
if index > start {
return Some(String::from_utf8_lossy(&content[start..index]));
}
None
}
#[test]
fn test_scan_coding_comment() {
let text = "# encoding: utf-8";
let result = scan_coding_comment(text.as_bytes());
assert_eq!(result, Some("utf-8".into()));
let text = "#coding:utf-8";
let result = scan_coding_comment(&text.as_bytes());
assert_eq!(result, Some("utf-8".into()));
let text = "# foo\n# encoding: utf-8";
let result = scan_coding_comment(&text.as_bytes());
assert_eq!(result, None);
let text = "# encoding: latin1 encoding: utf-8";
let result = scan_coding_comment(&text.as_bytes());
assert_eq!(result, Some("latin1".into()));
let text = "# encoding: nonsense";
let result = scan_coding_comment(&text.as_bytes());
assert_eq!(result, Some("nonsense".into()));
let text = "# coding = utf-8";
let result = scan_coding_comment(&text.as_bytes());
assert_eq!(result, Some("utf-8".into()));
let text = "# CODING = utf-8";
let result = scan_coding_comment(&text.as_bytes());
assert_eq!(result, Some("utf-8".into()));
let text = "# CoDiNg = utf-8";
let result = scan_coding_comment(&text.as_bytes());
assert_eq!(result, Some("utf-8".into()));
let text = "# blah blahblahcoding = utf-8";
let result = scan_coding_comment(&text.as_bytes());
assert_eq!(result, Some("utf-8".into()));
// unicode BOM is ignored
let text = "\u{FEFF}# encoding: utf-8";
let result = scan_coding_comment(&text.as_bytes());
assert_eq!(result, Some("utf-8".into()));
let text = "\u{FEFF} # encoding: utf-8";
let result = scan_coding_comment(&text.as_bytes());
assert_eq!(result, Some("utf-8".into()));
let text = "#! /usr/bin/env ruby\n # encoding: utf-8";
let result = scan_coding_comment(&text.as_bytes());
assert_eq!(result, Some("utf-8".into()));
let text = "\u{FEFF}#! /usr/bin/env ruby\n # encoding: utf-8";
let result = scan_coding_comment(&text.as_bytes());
assert_eq!(result, Some("utf-8".into()));
// A #! must be the first thing on a line, otherwise it's a normal comment
let text = " #! /usr/bin/env ruby encoding = utf-8";
let result = scan_coding_comment(&text.as_bytes());
assert_eq!(result, Some("utf-8".into()));
let text = " #! /usr/bin/env ruby \n # encoding = utf-8";
let result = scan_coding_comment(&text.as_bytes());
assert_eq!(result, None);
}

272
ruby/extractor/src/trap.rs Normal file
View File

@@ -0,0 +1,272 @@
use std::borrow::Cow;
use std::fmt;
use std::io::{BufWriter, Write};
use std::path::Path;
use flate2::write::GzEncoder;
pub struct Writer {
/// The accumulated trap entries
trap_output: Vec<Entry>,
/// A counter for generating fresh labels
counter: u32,
/// cache of global keys
global_keys: std::collections::HashMap<String, Label>,
}
impl Writer {
pub fn new() -> Writer {
Writer {
counter: 0,
trap_output: Vec::new(),
global_keys: std::collections::HashMap::new(),
}
}
pub fn fresh_id(&mut self) -> Label {
let label = Label(self.counter);
self.counter += 1;
self.trap_output.push(Entry::FreshId(label));
label
}
/// Gets a label that will hold the unique ID of the passed string at import time.
/// This can be used for incrementally importable TRAP files -- use globally unique
/// strings to compute a unique ID for table tuples.
///
/// Note: You probably want to make sure that the key strings that you use are disjoint
/// for disjoint column types; the standard way of doing this is to prefix (or append)
/// the column type name to the ID. Thus, you might identify methods in Java by the
/// full ID "methods_com.method.package.DeclaringClass.method(argumentList)".
pub fn global_id(&mut self, key: &str) -> (Label, bool) {
if let Some(label) = self.global_keys.get(key) {
return (*label, false);
}
let label = Label(self.counter);
self.counter += 1;
self.global_keys.insert(key.to_owned(), label);
self.trap_output
.push(Entry::MapLabelToKey(label, key.to_owned()));
(label, true)
}
pub fn add_tuple(&mut self, table_name: &str, args: Vec<Arg>) {
self.trap_output
.push(Entry::GenericTuple(table_name.to_owned(), args))
}
pub fn comment(&mut self, text: String) {
self.trap_output.push(Entry::Comment(text));
}
pub fn write_to_file(&self, path: &Path, compression: Compression) -> std::io::Result<()> {
let trap_file = std::fs::File::create(path)?;
let mut trap_file = BufWriter::new(trap_file);
match compression {
Compression::None => self.write_trap_entries(&mut trap_file),
Compression::Gzip => {
let mut compressed_writer = GzEncoder::new(trap_file, flate2::Compression::fast());
self.write_trap_entries(&mut compressed_writer)
}
}
}
fn write_trap_entries<W: Write>(&self, file: &mut W) -> std::io::Result<()> {
for trap_entry in &self.trap_output {
writeln!(file, "{}", trap_entry)?;
}
std::io::Result::Ok(())
}
}
pub enum Entry {
/// Maps the label to a fresh id, e.g. `#123=*`.
FreshId(Label),
/// Maps the label to a key, e.g. `#7=@"foo"`.
MapLabelToKey(Label, String),
/// foo_bar(arg*)
GenericTuple(String, Vec<Arg>),
Comment(String),
}
impl fmt::Display for Entry {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
Entry::FreshId(label) => write!(f, "{}=*", label),
Entry::MapLabelToKey(label, key) => {
write!(f, "{}=@\"{}\"", label, key.replace("\"", "\"\""))
}
Entry::GenericTuple(name, args) => {
write!(f, "{}(", name)?;
for (index, arg) in args.iter().enumerate() {
if index > 0 {
write!(f, ",")?;
}
write!(f, "{}", arg)?;
}
write!(f, ")")
}
Entry::Comment(line) => write!(f, "// {}", line),
}
}
}
#[derive(Debug, Copy, Clone)]
// Identifiers of the form #0, #1...
pub struct Label(u32);
impl fmt::Display for Label {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "#{:x}", self.0)
}
}
// Some untyped argument to a TrapEntry.
#[derive(Debug, Clone)]
pub enum Arg {
Label(Label),
Int(usize),
String(String),
}
const MAX_STRLEN: usize = 1048576;
impl fmt::Display for Arg {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
Arg::Label(x) => write!(f, "{}", x),
Arg::Int(x) => write!(f, "{}", x),
Arg::String(x) => write!(
f,
"\"{}\"",
limit_string(x, MAX_STRLEN).replace("\"", "\"\"")
),
}
}
}
pub struct Program(Vec<Entry>);
impl fmt::Display for Program {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
let mut text = String::new();
for trap_entry in &self.0 {
text.push_str(&format!("{}\n", trap_entry));
}
write!(f, "{}", text)
}
}
pub fn full_id_for_file(normalized_path: &str) -> String {
format!("{};sourcefile", escape_key(normalized_path))
}
pub fn full_id_for_folder(normalized_path: &str) -> String {
format!("{};folder", escape_key(normalized_path))
}
/// Escapes a string for use in a TRAP key, by replacing special characters with
/// HTML entities.
fn escape_key<'a, S: Into<Cow<'a, str>>>(key: S) -> Cow<'a, str> {
fn needs_escaping(c: char) -> bool {
matches!(c, '&' | '{' | '}' | '"' | '@' | '#')
}
let key = key.into();
if key.contains(needs_escaping) {
let mut escaped = String::with_capacity(2 * key.len());
for c in key.chars() {
match c {
'&' => escaped.push_str("&amp;"),
'{' => escaped.push_str("&lbrace;"),
'}' => escaped.push_str("&rbrace;"),
'"' => escaped.push_str("&quot;"),
'@' => escaped.push_str("&commat;"),
'#' => escaped.push_str("&num;"),
_ => escaped.push(c),
}
}
Cow::Owned(escaped)
} else {
key
}
}
/// Limit the length (in bytes) of a string. If the string's length in bytes is
/// less than or equal to the limit then the entire string is returned. Otherwise
/// the string is sliced at the provided limit. If there is a multi-byte character
/// at the limit then the returned slice will be slightly shorter than the limit to
/// avoid splitting that multi-byte character.
fn limit_string(string: &str, max_size: usize) -> &str {
if string.len() <= max_size {
return string;
}
let p = string.as_bytes();
let mut index = max_size;
// We want to clip the string at [max_size]; however, the character at that position
// may span several bytes. We need to find the first byte of the character. In UTF-8
// encoded data any byte that matches the bit pattern 10XXXXXX is not a start byte.
// Therefore we decrement the index as long as there are bytes matching this pattern.
// This ensures we cut the string at the border between one character and another.
while index > 0 && (p[index] & 0b11000000) == 0b10000000 {
index -= 1;
}
&string[0..index]
}
#[derive(Clone, Copy)]
pub enum Compression {
None,
Gzip,
}
impl Compression {
pub fn from_env(var_name: &str) -> Compression {
match std::env::var(var_name) {
Ok(method) => match Compression::from_string(&method) {
Some(c) => c,
None => {
tracing::error!("Unknown compression method '{}'; using gzip.", &method);
Compression::Gzip
}
},
// Default compression method if the env var isn't set:
Err(_) => Compression::Gzip,
}
}
pub fn from_string(s: &str) -> Option<Compression> {
match s.to_lowercase().as_ref() {
"none" => Some(Compression::None),
"gzip" => Some(Compression::Gzip),
_ => None,
}
}
pub fn extension(&self) -> &str {
match self {
Compression::None => "trap",
Compression::Gzip => "trap.gz",
}
}
}
#[test]
fn limit_string_test() {
assert_eq!("hello", limit_string(&"hello world".to_owned(), 5));
assert_eq!("hi ☹", limit_string(&"hi ☹☹".to_owned(), 6));
assert_eq!("hi ", limit_string(&"hi ☹☹".to_owned(), 5));
}
#[test]
fn escape_key_test() {
assert_eq!("foo!", escape_key("foo!"));
assert_eq!("foo&lbrace;&rbrace;", escape_key("foo{}"));
assert_eq!("&lbrace;&rbrace;", escape_key("{}"));
assert_eq!("", escape_key(""));
assert_eq!("/path/to/foo.rb", escape_key("/path/to/foo.rb"));
assert_eq!(
"/path/to/foo&amp;&lbrace;&rbrace;&quot;&commat;&num;.rb",
escape_key("/path/to/foo&{}\"@#.rb")
);
}

View File

@@ -12,4 +12,4 @@ node-types = { path = "../node-types" }
tracing = "0.1"
tracing-subscriber = { version = "0.3.3", features = ["env-filter"] }
tree-sitter-embedded-template = { git = "https://github.com/tree-sitter/tree-sitter-embedded-template.git", rev = "1a538da253d73f896b9f6c0c7d79cda58791ac5c" }
tree-sitter-ruby = { git = "https://github.com/tree-sitter/tree-sitter-ruby.git", rev = "5b305c3cd32db10494cedd2743de6bbe32f1a573" }
tree-sitter-ruby = { git = "https://github.com/tree-sitter/tree-sitter-ruby.git", rev = "e75d04404c9dd71ad68850d5c672b226d5e694f3" }

View File

@@ -0,0 +1,5 @@
---
category: minorAnalysis
---
- Calls to `ActiveRecord::Relation#annotate` are now recognized as`SqlExecution`s so that it will be considered as a sink for queries like rb/sql-injection.

View File

@@ -133,6 +133,11 @@ private Expr sqlFragmentArgument(MethodCall call) {
or
methodName = "reload" and
result = call.getKeywordArgument("lock")
or
// Calls to `annotate` can be used to add block comments to SQL queries. These are potentially vulnerable to
// SQLi if user supplied input is passed in as an argument.
methodName = "annotate" and
result = call.getArgument(_)
)
)
}

View File

@@ -1762,6 +1762,12 @@ escape_sequences/escapes.rb:
# 93| getStmt: [SymbolLiteral] :"\C-?"
# 93| getComponent: [StringEscapeSequenceComponent] \C
# 93| getComponent: [StringTextComponent] -?
misc/iso-8859-15.rb:
# 1| [Toplevel] iso-8859-15.rb
# 4| getStmt: [MethodCall] call to print
# 4| getReceiver: [SelfVariableAccess] self
# 4| getArgument: [StringLiteral] "EUR = €"
# 4| getComponent: [StringTextComponent] EUR = €
literals/literals.rb:
# 1| [Toplevel] literals.rb
# 2| getStmt: [NilLiteral] nil

View File

@@ -4604,6 +4604,17 @@ literals/literals.rb:
# 193| cat file.txt
# 193|
# 195| 1: [HeredocEnd] SCRIPT
misc/iso-8859-15.rb:
# 1| [Program] Program
# 4| 0: [Call] Call
# 4| 0: [Identifier] print
# 4| 1: [ArgumentList] ArgumentList
# 4| 0: [String] String
# 4| 0: [ReservedWord] "
# 4| 1: [StringContent] EUR = €
# 4| 2: [ReservedWord] "
# 1| [Comment] #! /usr/bin/ruby
# 2| [Comment] # coding: iso-8859-15
misc/misc.erb:
# 2| [Program] Program
# 2| 0: [Call] Call

View File

@@ -717,6 +717,7 @@ exprValue
| literals/literals.rb:198:8:198:8 | 5 | 5 | int |
| literals/literals.rb:199:2:199:2 | :y | :y | symbol |
| literals/literals.rb:199:7:199:7 | :Z | :Z | symbol |
| misc/iso-8859-15.rb:4:7:4:17 | "EUR = \u20ac" | EUR = \u20ac | string |
| misc/misc.erb:2:15:2:37 | "main_include_admin.js" | main_include_admin.js | string |
| misc/misc.rb:1:7:1:11 | "bar" | bar | string |
| misc/misc.rb:3:7:3:9 | foo | foo | string |
@@ -1592,6 +1593,7 @@ exprCfgNodeValue
| literals/literals.rb:198:8:198:8 | 5 | 5 | int |
| literals/literals.rb:199:2:199:2 | :y | :y | symbol |
| literals/literals.rb:199:7:199:7 | :Z | :Z | symbol |
| misc/iso-8859-15.rb:4:7:4:17 | "EUR = \u20ac" | EUR = \u20ac | string |
| misc/misc.erb:2:15:2:37 | "main_include_admin.js" | main_include_admin.js | string |
| misc/misc.rb:1:7:1:11 | "bar" | bar | string |
| misc/misc.rb:3:7:3:9 | foo | foo | string |

View File

@@ -0,0 +1,4 @@
#! /usr/bin/ruby
# coding: iso-8859-15
print "EUR = <20>"

View File

@@ -2,6 +2,7 @@ actionControllerControllerClasses
| ActiveRecord.rb:23:1:39:3 | FooController |
| ActiveRecord.rb:41:1:64:3 | BarController |
| ActiveRecord.rb:66:1:70:3 | BazController |
| ActiveRecord.rb:72:1:80:3 | AnnotatedController |
| app/controllers/comments_controller.rb:1:1:7:3 | CommentsController |
| app/controllers/foo/bars_controller.rb:3:1:39:3 | BarsController |
| app/controllers/photos_controller.rb:1:1:4:3 | PhotosController |
@@ -12,6 +13,8 @@ actionControllerActionMethods
| ActiveRecord.rb:42:3:47:5 | some_other_request_handler |
| ActiveRecord.rb:49:3:63:5 | safe_paths |
| ActiveRecord.rb:67:3:69:5 | yet_another_handler |
| ActiveRecord.rb:73:3:75:5 | index |
| ActiveRecord.rb:77:3:79:5 | unsafe_action |
| app/controllers/comments_controller.rb:2:3:3:5 | index |
| app/controllers/comments_controller.rb:5:3:6:5 | show |
| app/controllers/foo/bars_controller.rb:5:3:7:5 | index |
@@ -38,6 +41,7 @@ paramsCalls
| ActiveRecord.rb:59:12:59:17 | call to params |
| ActiveRecord.rb:62:15:62:20 | call to params |
| ActiveRecord.rb:68:21:68:26 | call to params |
| ActiveRecord.rb:78:59:78:64 | call to params |
| app/controllers/foo/bars_controller.rb:13:21:13:26 | call to params |
| app/controllers/foo/bars_controller.rb:14:10:14:15 | call to params |
| app/controllers/foo/bars_controller.rb:21:21:21:26 | call to params |
@@ -57,6 +61,7 @@ paramsSources
| ActiveRecord.rb:59:12:59:17 | call to params |
| ActiveRecord.rb:62:15:62:20 | call to params |
| ActiveRecord.rb:68:21:68:26 | call to params |
| ActiveRecord.rb:78:59:78:64 | call to params |
| app/controllers/foo/bars_controller.rb:13:21:13:26 | call to params |
| app/controllers/foo/bars_controller.rb:14:10:14:15 | call to params |
| app/controllers/foo/bars_controller.rb:21:21:21:26 | call to params |

View File

@@ -22,6 +22,7 @@ activeRecordSqlExecutionRanges
| ActiveRecord.rb:46:20:46:32 | ... + ... |
| ActiveRecord.rb:52:16:52:28 | "name #{...}" |
| ActiveRecord.rb:56:20:56:39 | "username = #{...}" |
| ActiveRecord.rb:78:27:78:76 | "this is an unsafe annotation:..." |
activeRecordModelClassMethodCalls
| ActiveRecord.rb:2:3:2:17 | call to has_many |
| ActiveRecord.rb:6:3:6:24 | call to belongs_to |
@@ -44,6 +45,8 @@ activeRecordModelClassMethodCalls
| ActiveRecord.rb:60:5:60:33 | call to find_by |
| ActiveRecord.rb:62:5:62:34 | call to find |
| ActiveRecord.rb:68:5:68:45 | call to delete_by |
| ActiveRecord.rb:74:13:74:54 | call to annotate |
| ActiveRecord.rb:78:13:78:77 | call to annotate |
potentiallyUnsafeSqlExecutingMethodCall
| ActiveRecord.rb:9:5:9:68 | call to find |
| ActiveRecord.rb:19:5:19:25 | call to destroy_by |
@@ -55,6 +58,7 @@ potentiallyUnsafeSqlExecutingMethodCall
| ActiveRecord.rb:46:5:46:33 | call to delete_by |
| ActiveRecord.rb:52:5:52:29 | call to order |
| ActiveRecord.rb:56:7:56:40 | call to find_by |
| ActiveRecord.rb:78:13:78:77 | call to annotate |
activeRecordModelInstantiations
| ActiveRecord.rb:9:5:9:68 | call to find | ActiveRecord.rb:5:1:15:3 | User |
| ActiveRecord.rb:13:5:13:40 | call to find_by | ActiveRecord.rb:1:1:3:3 | UserGroup |

View File

@@ -68,3 +68,13 @@ class BazController < BarController
Admin.delete_by(params[:admin_condition])
end
end
class AnnotatedController < ActionController::Base
def index
users = User.annotate("this is a safe annotation")
end
def unsafe_action
users = User.annotate("this is an unsafe annotation:#{params[:comment]}")
end
end

View File

@@ -137,3 +137,17 @@ class BazController < BarController
Admin.delete_by(params[:admin_condition])
end
end
class AnnotatedController < ActionController::Base
def index
name = params[:user_name]
# GOOD: string literal arguments not controlled by user are safe for annotations
users = User.annotate("this is a safe annotation").find_by(user_name: name)
end
def unsafe_action
name = params[:user_name]
# BAD: user input passed into annotations are vulnerable to SQLi
users = User.annotate("this is an unsafe annotation:#{params[:comment]}").find_by(user_name: name)
end
end

View File

@@ -31,6 +31,8 @@ edges
| ActiveRecordInjection.rb:99:11:99:17 | ...[...] : | ActiveRecordInjection.rb:104:20:104:32 | ... + ... |
| ActiveRecordInjection.rb:137:21:137:26 | call to params : | ActiveRecordInjection.rb:137:21:137:44 | ...[...] : |
| ActiveRecordInjection.rb:137:21:137:44 | ...[...] : | ActiveRecordInjection.rb:20:22:20:30 | condition : |
| ActiveRecordInjection.rb:151:59:151:64 | call to params : | ActiveRecordInjection.rb:151:59:151:74 | ...[...] : |
| ActiveRecordInjection.rb:151:59:151:74 | ...[...] : | ActiveRecordInjection.rb:151:27:151:76 | "this is an unsafe annotation:..." |
nodes
| ActiveRecordInjection.rb:8:25:8:28 | name : | semmle.label | name : |
| ActiveRecordInjection.rb:8:31:8:34 | pass : | semmle.label | pass : |
@@ -80,6 +82,9 @@ nodes
| ActiveRecordInjection.rb:104:20:104:32 | ... + ... | semmle.label | ... + ... |
| ActiveRecordInjection.rb:137:21:137:26 | call to params : | semmle.label | call to params : |
| ActiveRecordInjection.rb:137:21:137:44 | ...[...] : | semmle.label | ...[...] : |
| ActiveRecordInjection.rb:151:27:151:76 | "this is an unsafe annotation:..." | semmle.label | "this is an unsafe annotation:..." |
| ActiveRecordInjection.rb:151:59:151:64 | call to params : | semmle.label | call to params : |
| ActiveRecordInjection.rb:151:59:151:74 | ...[...] : | semmle.label | ...[...] : |
subpaths
#select
| ActiveRecordInjection.rb:10:33:10:67 | "name='#{...}' and pass='#{...}'" | ActiveRecordInjection.rb:70:23:70:28 | call to params : | ActiveRecordInjection.rb:10:33:10:67 | "name='#{...}' and pass='#{...}'" | This SQL query depends on $@. | ActiveRecordInjection.rb:70:23:70:28 | call to params | a user-provided value |
@@ -99,3 +104,4 @@ subpaths
| ActiveRecordInjection.rb:88:18:88:35 | ...[...] | ActiveRecordInjection.rb:88:18:88:23 | call to params : | ActiveRecordInjection.rb:88:18:88:35 | ...[...] | This SQL query depends on $@. | ActiveRecordInjection.rb:88:18:88:23 | call to params | a user-provided value |
| ActiveRecordInjection.rb:92:21:92:35 | ...[...] | ActiveRecordInjection.rb:92:21:92:26 | call to params : | ActiveRecordInjection.rb:92:21:92:35 | ...[...] | This SQL query depends on $@. | ActiveRecordInjection.rb:92:21:92:26 | call to params | a user-provided value |
| ActiveRecordInjection.rb:104:20:104:32 | ... + ... | ActiveRecordInjection.rb:98:10:98:15 | call to params : | ActiveRecordInjection.rb:104:20:104:32 | ... + ... | This SQL query depends on $@. | ActiveRecordInjection.rb:98:10:98:15 | call to params | a user-provided value |
| ActiveRecordInjection.rb:151:27:151:76 | "this is an unsafe annotation:..." | ActiveRecordInjection.rb:151:59:151:64 | call to params : | ActiveRecordInjection.rb:151:27:151:76 | "this is an unsafe annotation:..." | This SQL query depends on $@. | ActiveRecordInjection.rb:151:59:151:64 | call to params | a user-provided value |