Merge pull request #26 from github/aibaars/tokens

Store tokens into separate table
This commit is contained in:
Arthur Baars
2020-11-05 14:03:26 +01:00
committed by GitHub
8 changed files with 507 additions and 2070 deletions

View File

@@ -187,6 +187,7 @@ impl Extractor {
// TODO: should we handle path strings that are not valid UTF8 better?
path: format!("{}", path.display()),
file_label: *file_label,
token_counter: 0,
stack: Vec::new(),
tables: build_schema_lookup(&self.schema),
union_types: build_union_type_lookup(&self.schema),
@@ -248,7 +249,7 @@ fn full_id_for_folder(path: &Path) -> String {
fn build_schema_lookup<'a>(schema: &'a Vec<Entry>) -> Map<&'a TypeName, &'a Entry> {
let mut map = std::collections::BTreeMap::new();
for entry in schema {
if let Entry::Table { type_name, .. } = entry {
if let Entry::Token { type_name, .. } | Entry::Table { type_name, .. } = entry {
map.insert(type_name, entry);
}
}
@@ -275,6 +276,8 @@ struct Visitor<'a> {
source: &'a Vec<u8>,
/// A TrapWriter to accumulate trap entries
trap_writer: TrapWriter,
/// A counter for tokens
token_counter: usize,
/// A lookup table from type name to dbscheme table entries
tables: Map<&'a TypeName, &'a Entry>,
/// A lookup table for union types mapping a type name to its direct members
@@ -290,14 +293,18 @@ struct Visitor<'a> {
impl Visitor<'_> {
fn enter_node(&mut self, node: Node) -> bool {
if node.is_error() {
error!("{}:{}: parse error", &self.path, node.start_position().row);
error!(
"{}:{}: parse error",
&self.path,
node.start_position().row + 1
);
return false;
}
if node.is_missing() {
error!(
"{}:{}: parse error: expecting '{}'",
&self.path,
node.start_position().row,
node.start_position().row + 1,
node.kind()
);
return false;
@@ -312,61 +319,73 @@ impl Visitor<'_> {
return;
}
let child_nodes = self.stack.pop().expect("Vistor: empty stack");
let id = self.trap_writer.fresh_id();
let (start_line, start_column, end_line, end_column) = location_for(&self.source, node);
let loc = self.trap_writer.location(
self.file_label.clone(),
start_line,
start_column,
end_line,
end_column,
);
let table = self.tables.get(&TypeName {
kind: node.kind().to_owned(),
named: node.is_named(),
});
if let Some(Entry::Table { fields, .. }) = table {
let id = self.trap_writer.fresh_id();
let (start_line, start_column, end_line, end_column) = location_for(&self.source, node);
let loc = self.trap_writer.location(
self.file_label.clone(),
start_line,
start_column,
end_line,
end_column,
);
let table_name = escape_name(&format!(
"{}_def",
node_type_name(node.kind(), node.is_named())
));
let args: Option<Vec<Arg>>;
if fields.is_empty() {
args = Some(vec![sliced_source_arg(self.source, node)]);
} else {
args = self.complex_node(&node, fields, child_nodes, id);
let mut valid = true;
match table {
Some(Entry::Token { kind_id, .. }) => {
self.trap_writer.add_tuple(
"tokeninfo",
vec![
Arg::Label(id),
Arg::Int(*kind_id),
Arg::Label(self.file_label),
Arg::Int(self.token_counter),
sliced_source_arg(self.source, node),
Arg::Label(loc),
],
);
self.token_counter += 1;
}
if let Some(args) = args {
let mut all_args = Vec::new();
all_args.push(Arg::Label(id));
all_args.extend(args);
all_args.push(Arg::Label(loc));
self.trap_writer.add_tuple(&table_name, all_args);
Some(Entry::Table { fields, .. }) => {
let table_name = escape_name(&format!(
"{}_def",
node_type_name(node.kind(), node.is_named())
));
if let Some(args) = self.complex_node(&node, fields, child_nodes, id) {
let mut all_args = Vec::new();
all_args.push(Arg::Label(id));
all_args.extend(args);
all_args.push(Arg::Label(loc));
self.trap_writer.add_tuple(&table_name, all_args);
}
}
if !node.is_extra() {
// Extra nodes are independent root nodes and do not belong to the parent node
// Therefore we should not register them in the parent vector
if let Some(parent) = self.stack.last_mut() {
parent.push((
field_name,
id,
TypeName {
kind: node.kind().to_owned(),
named: node.is_named(),
},
))
};
_ => {
error!(
"{}:{}: unknown table type: '{}'",
&self.path,
node.start_position().row + 1,
node.kind()
);
valid = false;
}
} else {
error!(
"{}:{}: unknown table type: '{}'",
&self.path,
node.start_position().row,
node.kind()
);
}
if valid && !node.is_extra() {
// Extra nodes are independent root nodes and do not belong to the parent node
// Therefore we should not register them in the parent vector
if let Some(parent) = self.stack.last_mut() {
parent.push((
field_name,
id,
TypeName {
kind: node.kind().to_owned(),
named: node.is_named(),
},
))
};
}
}
fn complex_node(
&mut self,
node: &Node,
@@ -387,7 +406,7 @@ impl Visitor<'_> {
error!(
"{}:{}: type mismatch for field {}::{} with type {:?} != {:?}",
&self.path,
node.start_position().row,
node.start_position().row + 1,
node.kind(),
child_field.unwrap_or("child"),
child_type,
@@ -399,7 +418,7 @@ impl Visitor<'_> {
error!(
"{}:{}: value for unknown field: {}::{} and type {:?}",
&self.path,
node.start_position().row,
node.start_position().row + 1,
node.kind(),
&child_field.unwrap_or("child"),
&child_type
@@ -420,7 +439,7 @@ impl Visitor<'_> {
error!(
"{}:{}: {} for field: {}::{}",
&self.path,
node.start_position().row,
node.start_position().row + 1,
if child_ids.is_empty() {
"missing value"
} else {
@@ -437,7 +456,7 @@ impl Visitor<'_> {
error!(
"{}:{}: too many values for field: {}::{}",
&self.path,
node.start_position().row,
node.start_position().row + 1,
node.kind(),
&field.get_name()
);
@@ -483,9 +502,7 @@ impl Visitor<'_> {
// Emit a slice of a source file as an Arg.
// The slice is the part of `source` covered by the byte range of node `n`,
// wrapped in `Arg::String`.
fn sliced_source_arg(source: &Vec<u8>, n: Node) -> Arg {
let range = n.byte_range();
// Strict decoding: `expect` panics with "Failed to decode string" when the
// slice is not valid UTF-8.
Arg::String(String::from(
std::str::from_utf8(&source[range.start..range.end]).expect("Failed to decode string"),
))
// Lossy decoding: `from_utf8_lossy` substitutes U+FFFD for invalid UTF-8
// sequences, so this form does not panic on undecodable bytes.
Arg::String(String::from_utf8_lossy(&source[range.start..range.end]).into_owned())
}
// Emit a pair of `TrapEntry`s for the provided node, appropriately calibrated.

View File

@@ -5,7 +5,8 @@ use std::fmt;
pub enum Entry {
/// An entry defining a database table.
Table(Table),
/// An entry defining a case type, whose branches are selected by the value of a column.
Case(Case),
/// An entry defining a type that is a union of other types.
Union(Union),
}
@@ -23,6 +24,13 @@ pub struct Union {
pub members: Vec<String>,
}
/// A case type in the database schema: the type `@name` is split into
/// branches according to the integer value stored in `column`.
pub struct Case {
/// Name of the type being split (rendered as `@name`).
pub name: String,
/// Name of the column whose value selects the branch.
pub column: String,
/// Pairs of (column value, branch type name); each is rendered as `value = @type`.
pub branches: Vec<(usize, String)>,
}
/// A column in a table.
pub struct Column {
pub db_type: DbColumnType,
@@ -38,6 +46,18 @@ pub enum DbColumnType {
String,
}
impl fmt::Display for Case {
    /// Renders the case type as dbscheme text: a `case @name.column of` header,
    /// one `value = @type` line per branch (the first prefixed with a space,
    /// every later one with `|`), and a closing `;` line.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        writeln!(f, "case @{}.{} of", self.name, self.column)?;
        for (position, (value, type_name)) in self.branches.iter().enumerate() {
            // Only the first branch is introduced by a blank; the rest are alternatives.
            let separator = if position == 0 { " " } else { "|" };
            writeln!(f, "{} {} = @{}", separator, value, type_name)?;
        }
        writeln!(f, ";")
    }
}
impl fmt::Display for Table {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
if let Some(keyset) = &self.keysets {
@@ -110,6 +130,7 @@ pub fn write(
for entry in entries {
match entry {
Entry::Case(case) => write!(file, "{}\n\n", case)?,
Entry::Table(table) => write!(file, "{}\n\n", table)?,
Entry::Union(union) => write!(file, "{}\n\n", union)?,
}

View File

@@ -4,16 +4,31 @@ mod ql;
mod ql_gen;
use language::Language;
use std::collections::BTreeMap as Map;
use std::collections::BTreeSet as Set;
use std::fs::File;
use std::io::LineWriter;
use std::path::PathBuf;
use tracing::{error, info};
/// Returns the dbscheme type name to use for a child of type `t`.
///
/// - Unnamed types are all mapped to `reserved_word`.
/// - Named types whose kind is a key of `token_types` become `token_<kind>`.
/// - Every other type is named by `node_types::node_type_name`.
fn child_node_type_name(token_types: &Map<String, usize>, t: &node_types::TypeName) -> String {
    match t.named {
        // an unnamed token
        false => "reserved_word".to_owned(),
        // a named token
        true if token_types.contains_key(&t.kind) => format!("token_{}", t.kind),
        // a normal node
        true => node_types::node_type_name(&t.kind, t.named),
    }
}
/// Given the name of the parent node, and its field information, returns the
/// name of the field's type. This may be an ad-hoc union of all the possible
/// types the field can take, in which case the union is added to `entries`.
fn make_field_type(
token_types: &Map<String, usize>,
parent_name: &str,
field_name: &str,
types: &Set<node_types::TypeName>,
@@ -22,7 +37,7 @@ fn make_field_type(
if types.len() == 1 {
// This field can only have a single type.
let t = types.iter().next().unwrap();
node_types::escape_name(&node_types::node_type_name(&t.kind, t.named))
node_types::escape_name(&child_node_type_name(token_types, t))
} else {
// This field can have one of several types. Create an ad-hoc QL union
// type to represent them.
@@ -30,7 +45,7 @@ fn make_field_type(
let field_union_name = node_types::escape_name(&field_union_name);
let members: Vec<String> = types
.iter()
.map(|t| node_types::escape_name(&node_types::node_type_name(&t.kind, t.named)))
.map(|t| node_types::escape_name(&child_node_type_name(token_types, t)))
.collect();
entries.push(dbscheme::Entry::Union(dbscheme::Union {
name: field_union_name.clone(),
@@ -43,6 +58,7 @@ fn make_field_type(
/// Adds the appropriate dbscheme information for the given field, either as a
/// column on `main_table`, or as an auxiliary table.
fn add_field(
token_types: &Map<String, usize>,
main_table: &mut dbscheme::Table,
field: &node_types::Field,
entries: &mut Vec<dbscheme::Entry>,
@@ -53,7 +69,13 @@ fn add_field(
node_types::Storage::Table(has_index) => {
// This field can appear zero or multiple times, so put
// it in an auxiliary table.
let field_type = make_field_type(&parent_name, &field_name, &field.types, entries);
let field_type = make_field_type(
token_types,
&parent_name,
&field_name,
&field.types,
entries,
);
let parent_column = dbscheme::Column {
unique: !*has_index,
db_type: dbscheme::DbColumnType::Int,
@@ -98,7 +120,13 @@ fn add_field(
node_types::Storage::Column => {
// This field must appear exactly once, so we add it as
// a column to the main table for the node type.
let field_type = make_field_type(&parent_name, &field_name, &field.types, entries);
let field_type = make_field_type(
token_types,
&parent_name,
&field_name,
&field.types,
entries,
);
main_table.columns.push(dbscheme::Column {
unique: false,
db_type: dbscheme::DbColumnType::Int,
@@ -124,6 +152,15 @@ fn convert_nodes(nodes: &Vec<node_types::Entry>) -> Vec<dbscheme::Entry> {
create_source_location_prefix_table(),
];
let mut ast_node_members: Vec<String> = Vec::new();
let mut token_kinds: Map<String, usize> = Map::new();
ast_node_members.push(node_types::escape_name("token"));
for node in nodes {
if let node_types::Entry::Token { type_name, kind_id } = node {
if type_name.named {
token_kinds.insert(type_name.kind.to_owned(), *kind_id);
}
}
}
for node in nodes {
match &node {
@@ -135,9 +172,9 @@ fn convert_nodes(nodes: &Vec<node_types::Entry>) -> Vec<dbscheme::Entry> {
// type.
let mut members: Vec<String> = Vec::new();
for n_member in n_members {
members.push(node_types::escape_name(&node_types::node_type_name(
&n_member.kind,
n_member.named,
members.push(node_types::escape_name(&child_node_type_name(
&token_kinds,
n_member,
)))
}
entries.push(dbscheme::Entry::Union(dbscheme::Union {
@@ -167,7 +204,7 @@ fn convert_nodes(nodes: &Vec<node_types::Entry>) -> Vec<dbscheme::Entry> {
// If the type also has fields or children, then we create either
// auxiliary tables or columns in the defining table for them.
for field in fields {
add_field(&mut main_table, &field, &mut entries);
add_field(&token_kinds, &mut main_table, &field, &mut entries);
}
if fields.is_empty() {
@@ -193,9 +230,13 @@ fn convert_nodes(nodes: &Vec<node_types::Entry>) -> Vec<dbscheme::Entry> {
entries.push(dbscheme::Entry::Table(main_table));
}
node_types::Entry::Token { .. } => {}
}
}
// Add the tokeninfo table
add_tokeninfo_table(&mut entries, token_kinds);
// Create a union of all database types.
entries.push(dbscheme::Entry::Union(dbscheme::Union {
name: "ast_node".to_string(),
@@ -205,6 +246,71 @@ fn convert_nodes(nodes: &Vec<node_types::Entry>) -> Vec<dbscheme::Entry> {
entries
}
/// Appends two entries to `entries`: the `tokeninfo` table, followed by the
/// `token` case type that splits tokens by the value of their `kind` column.
///
/// The table columns are, in order: `id`, `kind`, `file`, `idx`, `value`, `loc`.
/// In the case type, kind `0` maps to `reserved_word`, and every
/// `(kind name, kind id)` pair of `token_kinds` maps the id to the escaped
/// name `token_<kind name>`.
fn add_tokeninfo_table(entries: &mut Vec<dbscheme::Entry>, token_kinds: Map<String, usize>) {
    // Every column except `id` is a non-unique reference column.
    let ref_column = |name: &str, db_type: dbscheme::DbColumnType, ql_type: ql::Type| {
        dbscheme::Column {
            db_type,
            name: name.to_string(),
            unique: false,
            ql_type,
            ql_type_is_ref: true,
        }
    };
    // The `id` column is the only unique, non-reference column.
    let id_column = dbscheme::Column {
        db_type: dbscheme::DbColumnType::Int,
        name: "id".to_string(),
        unique: true,
        ql_type: ql::Type::AtType("token".to_owned()),
        ql_type_is_ref: false,
    };
    let tokeninfo = dbscheme::Table {
        name: "tokeninfo".to_owned(),
        keysets: None,
        columns: vec![
            id_column,
            ref_column("kind", dbscheme::DbColumnType::Int, ql::Type::Int),
            ref_column(
                "file",
                dbscheme::DbColumnType::Int,
                ql::Type::AtType("file".to_string()),
            ),
            ref_column("idx", dbscheme::DbColumnType::Int, ql::Type::Int),
            ref_column("value", dbscheme::DbColumnType::String, ql::Type::String),
            ref_column(
                "loc",
                dbscheme::DbColumnType::Int,
                ql::Type::AtType("location".to_string()),
            ),
        ],
    };
    entries.push(dbscheme::Entry::Table(tokeninfo));

    // Branch 0 comes first, then one branch per entry of `token_kinds`
    // in the map's iteration order.
    let reserved_branch = std::iter::once((0, "reserved_word".to_owned()));
    let named_branches = token_kinds.iter().map(|(kind_name, kind_id)| {
        (
            *kind_id,
            node_types::escape_name(&format!("token_{}", kind_name)),
        )
    });
    entries.push(dbscheme::Entry::Case(dbscheme::Case {
        name: "token".to_owned(),
        column: "kind".to_owned(),
        branches: reserved_branch.chain(named_branches).collect(),
    }));
}
fn write_dbscheme(language: &Language, entries: &[dbscheme::Entry]) -> std::io::Result<()> {
info!(
"Writing database schema for {} to '{}'",

View File

@@ -69,6 +69,7 @@ fn create_supertype_map(nodes: &[node_types::Entry]) -> SupertypeMap {
}
}
}
node_types::Entry::Token { .. } => {}
}
}
@@ -142,7 +143,67 @@ fn create_ast_node_class() -> ql::Class {
],
}
}
/// Builds the QL class `Token`, which extends `@token` and `AstNode`.
///
/// The class has four predicates, in this order:
/// - `getValue`: binds `result` to the fifth argument of `tokeninfo`;
/// - `getLocation` (override): binds `result` to the sixth argument of `tokeninfo`;
/// - `toString` (override): `result = getValue()`;
/// - the predicate produced by `create_describe_ql_class("Token")`.
fn create_token_class() -> ql::Class {
    // Builds `tokeninfo(this, ...)` with six arguments: `this` first, `result`
    // at the zero-based position `result_column`, and `_` everywhere else.
    let tokeninfo_lookup = |result_column: usize| {
        let arguments: Vec<ql::Expression> = (0..6usize)
            .map(|column| {
                let variable = match column {
                    0 => "this",
                    c if c == result_column => "result",
                    _ => "_",
                };
                ql::Expression::Var(variable.to_owned())
            })
            .collect();
        ql::Expression::Pred("tokeninfo".to_owned(), arguments)
    };

    let predicates = vec![
        ql::Predicate {
            name: "getValue".to_owned(),
            overridden: false,
            return_type: Some(ql::Type::String),
            formal_parameters: vec![],
            body: tokeninfo_lookup(4),
        },
        ql::Predicate {
            name: "getLocation".to_owned(),
            overridden: true,
            return_type: Some(ql::Type::Normal("Location".to_owned())),
            formal_parameters: vec![],
            body: tokeninfo_lookup(5),
        },
        ql::Predicate {
            name: "toString".to_owned(),
            overridden: true,
            return_type: Some(ql::Type::String),
            formal_parameters: vec![],
            body: ql::Expression::Equals(
                Box::new(ql::Expression::Var("result".to_owned())),
                Box::new(ql::Expression::Pred("getValue".to_owned(), vec![])),
            ),
        },
        create_describe_ql_class("Token"),
    ];

    ql::Class {
        name: "Token".to_owned(),
        is_abstract: false,
        supertypes: vec![
            ql::Type::AtType("token".to_owned()),
            ql::Type::Normal("AstNode".to_owned()),
        ],
        characteristic_predicate: None,
        predicates,
    }
}
/// Creates a predicate whose body is `none()`.
fn create_none_predicate(
name: &str,
@@ -164,6 +225,7 @@ fn create_none_predicate(
/// types the field can take, in which case we create a new class and push it to
/// `classes`.
fn create_field_class(
token_kinds: &BTreeSet<String>,
parent_name: &str,
field: &node_types::Field,
classes: &mut Vec<ql::TopLevel>,
@@ -172,7 +234,11 @@ fn create_field_class(
if field.types.len() == 1 {
// This field can only have a single type.
let t = field.types.iter().next().unwrap();
node_types::escape_name(&node_types::node_type_name(&t.kind, t.named))
if !t.named || token_kinds.contains(&t.kind) {
"Token".to_owned()
} else {
node_types::escape_name(&node_types::node_type_name(&t.kind, t.named))
}
} else {
// This field can have one of several types. The dbscheme contains a
// union type, so we create a QL class to wrap that.
@@ -188,7 +254,7 @@ fn create_field_class(
]
.concat(),
characteristic_predicate: None,
predicates: vec![create_describe_ql_class(&class_name)],
predicates: vec![],
}));
field_union_name
}
@@ -351,16 +417,18 @@ fn create_field_getters(
field: &node_types::Field,
field_type: &str,
) -> (ql::Predicate, ql::Expression) {
let predicate_name = format!(
"get{}",
dbscheme_name_to_class_name(&node_types::escape_name(&field.get_name()))
);
let return_type = Some(ql::Type::Normal(dbscheme_name_to_class_name(field_type)));
match &field.storage {
node_types::Storage::Column => {
let result = (
ql::Predicate {
name: format!(
"get{}",
dbscheme_name_to_class_name(&node_types::escape_name(&field.get_name()))
),
name: predicate_name,
overridden: false,
return_type: Some(ql::Type::Normal(dbscheme_name_to_class_name(field_type))),
return_type: return_type,
formal_parameters: vec![],
body: create_get_field_expr_for_column_storage(
&main_table_name,
@@ -381,12 +449,9 @@ fn create_field_getters(
let field_table_name = format!("{}_{}", parent_name, &field.get_name());
(
ql::Predicate {
name: format!(
"get{}",
dbscheme_name_to_class_name(&node_types::escape_name(&field.get_name()))
),
name: predicate_name,
overridden: false,
return_type: Some(ql::Type::Normal(dbscheme_name_to_class_name(field_type))),
return_type: return_type,
formal_parameters: if *has_index {
vec![ql::FormalParameter {
name: "i".to_owned(),
@@ -416,10 +481,22 @@ pub fn convert_nodes(nodes: &Vec<node_types::Entry>) -> Vec<ql::TopLevel> {
ql::TopLevel::Import("codeql.files.FileSystem".to_owned()),
ql::TopLevel::Import("codeql.Locations".to_owned()),
ql::TopLevel::Class(create_ast_node_class()),
ql::TopLevel::Class(create_token_class()),
];
let mut token_kinds = BTreeSet::new();
for node in nodes {
if let node_types::Entry::Token { type_name, .. } = node {
if type_name.named {
token_kinds.insert(type_name.kind.to_owned());
}
}
}
for node in nodes {
match &node {
node_types::Entry::Token { .. } => {
// don't generate any QL code for tokens
}
node_types::Entry::Union {
type_name,
members: _,
@@ -440,7 +517,7 @@ pub fn convert_nodes(nodes: &Vec<node_types::Entry>) -> Vec<ql::TopLevel> {
]
.concat(),
characteristic_predicate: None,
predicates: vec![create_describe_ql_class(&class_name)],
predicates: vec![],
}));
}
node_types::Entry::Table { type_name, fields } => {
@@ -492,8 +569,13 @@ pub fn convert_nodes(nodes: &Vec<node_types::Entry>) -> Vec<ql::TopLevel> {
// - predicates to access the fields,
// - the QL expressions to access the fields that will be part of getAFieldOrChild.
for field in fields {
let field_type =
create_field_class(&name, field, &mut classes, &supertype_map);
let field_type = create_field_class(
&token_kinds,
&name,
field,
&mut classes,
&supertype_map,
);
let (get_pred, get_child_expr) = create_field_getters(
&main_table_name,
main_table_arity,

View File

@@ -15,6 +15,10 @@ pub enum Entry {
type_name: TypeName,
fields: Vec<Field>,
},
Token {
type_name: TypeName,
kind_id: usize,
},
}
#[derive(Debug, Ord, PartialOrd, Eq, PartialEq)]
@@ -75,7 +79,7 @@ fn convert_types(node_types: &Vec<NodeType>) -> Set<TypeName> {
}
pub fn convert_nodes(nodes: Vec<NodeInfo>) -> Vec<Entry> {
let mut entries: Vec<Entry> = Vec::new();
let mut token_kinds = Set::new();
for node in nodes {
if let Some(subtypes) = &node.subtypes {
// It's a tree-sitter supertype node, for which we create a union
@@ -87,6 +91,12 @@ pub fn convert_nodes(nodes: Vec<NodeInfo>) -> Vec<Entry> {
},
members: convert_types(&subtypes),
});
} else if node.fields.as_ref().map_or(0, |x| x.len()) == 0 && node.children.is_none() {
let type_name = TypeName {
kind: node.kind,
named: node.named,
};
token_kinds.insert(type_name);
} else {
// It's a product type, defined by a table.
let type_name = TypeName {
@@ -114,6 +124,16 @@ pub fn convert_nodes(nodes: Vec<NodeInfo>) -> Vec<Entry> {
entries.push(Entry::Table { type_name, fields });
}
}
let mut counter = 0;
for type_name in token_kinds {
let kind_id = if type_name.named {
counter += 1;
counter
} else {
0
};
entries.push(Entry::Token { type_name, kind_id });
}
entries
}

File diff suppressed because it is too large Load Diff

View File

@@ -26,20 +26,22 @@ class PrintAstConfiguration extends string {
class PrintAstNode extends AstNode {
/**
* Gets the value of the property `key` for this node. Only the
* `semmle.label` key is given a value here.
*/
string getProperty(string key) {
key = "semmle.label" and
result = this.toString()
result = "[" + this.describeQlClass() + "] " + this.toString()
}
/**
* Gets a textual representation of this node in the PrintAST output tree.
*/
override string toString() { result = "[" + this.describeQlClass() + "] " + super.toString() }
/**
* Holds if this node should be printed in the output. By default, all nodes
* are printed, but the query can override
* `PrintAstConfiguration.shouldPrintNode` to filter the output.
*/
predicate shouldPrint() { shouldPrintNode(this) }
predicate shouldPrint() {
(
// A `Token` passes this check only when some node has it as a field or child.
not this instanceof Token
or
exists(AstNode parent | parent.getAFieldOrChild() = this)
) and
shouldPrintNode(this)
}
}
private predicate shouldPrintNode(AstNode n) {

File diff suppressed because it is too large Load Diff