Merge pull request #11956 from aibaars/json-log

Ruby: structured logging
This commit is contained in:
Arthur Baars
2023-02-03 09:54:49 +01:00
committed by GitHub
6 changed files with 406 additions and 101 deletions

BIN
ruby/Cargo.lock generated

Binary file not shown.

View File

@@ -20,3 +20,6 @@ num_cpus = "1.14.0"
regex = "1.7.1"
encoding = "0.2"
lazy_static = "1.4.0"
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
chrono = { version = "0.4.19", features = ["serde"] }

View File

@@ -0,0 +1,268 @@
use serde::Serialize;
use std::io::Write;
use std::path::PathBuf;
/** SARIF severity */
#[derive(Serialize)]
pub enum Severity {
Error,
Warning,
#[allow(unused)]
Note,
}
#[derive(Serialize)]
#[serde(rename_all = "camelCase")]
pub struct Source {
/** An identifier under which it makes sense to group this diagnostic message. This is used to build the SARIF reporting descriptor object.*/
pub id: String,
/** Display name for the ID. This is used to build the SARIF reporting descriptor object. */
pub name: String,
#[serde(skip_serializing_if = "Option::is_none")]
/** Name of the CodeQL extractor. This is used to identify which tool component the reporting descriptor object should be nested under in SARIF.*/
pub extractor_name: Option<String>,
}
#[derive(Serialize)]
#[serde(rename_all = "camelCase")]
pub struct Visibility {
#[serde(skip_serializing_if = "std::ops::Not::not")]
/** True if the message should be displayed on the status page (defaults to false) */
pub status_page: bool,
#[serde(skip_serializing_if = "std::ops::Not::not")]
/** True if the message should be counted in the diagnostics summary table printed by `codeql database analyze` (defaults to false) */
pub cli_summary_table: bool,
#[serde(skip_serializing_if = "std::ops::Not::not")]
/** True if the message should be sent to telemetry (defaults to false) */
pub telemetry: bool,
}
#[derive(Serialize, Clone, Default)]
#[serde(rename_all = "camelCase")]
pub struct Location {
#[serde(skip_serializing_if = "Option::is_none")]
/** Path to the affected file if appropriate, relative to the source root */
pub file: Option<String>,
#[serde(skip_serializing_if = "Option::is_none")]
pub start_line: Option<usize>,
#[serde(skip_serializing_if = "Option::is_none")]
pub start_column: Option<usize>,
#[serde(skip_serializing_if = "Option::is_none")]
pub end_line: Option<usize>,
#[serde(skip_serializing_if = "Option::is_none")]
pub end_column: Option<usize>,
}
#[derive(Serialize)]
#[serde(rename_all = "camelCase")]
pub struct DiagnosticMessage {
/** Unix timestamp */
pub timestamp: chrono::DateTime<chrono::Utc>,
pub source: Source,
#[serde(skip_serializing_if = "String::is_empty")]
/** GitHub flavored Markdown formatted message. Should include inline links to any help pages. */
pub markdown_message: String,
#[serde(skip_serializing_if = "String::is_empty")]
/** Plain text message. Used by components where the string processing needed to support Markdown is cumbersome. */
pub plaintext_message: String,
#[serde(skip_serializing_if = "Vec::is_empty")]
/** List of help links intended to supplement the `plaintextMessage`. */
pub help_links: Vec<String>,
#[serde(skip_serializing_if = "Option::is_none")]
pub severity: Option<Severity>,
#[serde(skip_serializing_if = "std::ops::Not::not")]
pub internal: bool,
#[serde(skip_serializing_if = "is_default_visibility")]
pub visibility: Visibility,
#[serde(skip_serializing_if = "Option::is_none")]
pub location: Option<Location>,
}
fn is_default_visibility(v: &Visibility) -> bool {
!v.cli_summary_table && !v.status_page && !v.telemetry
}
pub struct LogWriter {
extractor: String,
path: Option<PathBuf>,
inner: Option<std::io::BufWriter<std::fs::File>>,
}
impl LogWriter {
pub fn message(&self, id: &str, name: &str) -> DiagnosticMessage {
DiagnosticMessage {
timestamp: chrono::Utc::now(),
source: Source {
id: format!("{}/{}", self.extractor, id),
name: name.to_owned(),
extractor_name: Some(self.extractor.to_owned()),
},
markdown_message: String::new(),
plaintext_message: String::new(),
help_links: vec![],
severity: None,
internal: false,
visibility: Visibility {
cli_summary_table: false,
status_page: false,
telemetry: false,
},
location: None,
}
}
pub fn write(&mut self, mesg: &DiagnosticMessage) {
let full_error_message = mesg.full_error_message();
match mesg.severity {
Some(Severity::Error) => tracing::error!("{}", full_error_message),
Some(Severity::Warning) => tracing::warn!("{}", full_error_message),
Some(Severity::Note) => tracing::info!("{}", full_error_message),
None => tracing::debug!("{}", full_error_message),
}
if self.inner.is_none() {
if let Some(path) = self.path.as_ref() {
match std::fs::OpenOptions::new()
.create(true)
.append(true)
.write(true)
.open(&path)
{
Err(e) => {
tracing::error!(
"Could not open log file '{}': {}",
&path.to_string_lossy(),
e
);
self.path = None;
self.inner = None
}
Ok(file) => self.inner = Some(std::io::BufWriter::new(file)),
}
}
}
if let Some(mut writer) = self.inner.as_mut() {
serde_json::to_writer(&mut writer, mesg)
.unwrap_or_else(|e| tracing::debug!("Failed to write log entry: {}", e));
&mut writer
.write_all(b"\n")
.unwrap_or_else(|e| tracing::debug!("Failed to write log entry: {}", e));
}
}
}
pub struct DiagnosticLoggers {
extractor: String,
root: Option<PathBuf>,
}
impl DiagnosticLoggers {
pub fn new(extractor: &str) -> Self {
let env_var = format!(
"CODEQL_EXTRACTOR_{}_DIAGNOSTIC_DIR",
extractor.to_ascii_uppercase()
);
let root = match std::env::var(&env_var) {
Err(e) => {
tracing::error!("{}: {}", e, &env_var);
None
}
Ok(dir) => {
if let Err(e) = std::fs::create_dir_all(&dir) {
tracing::error!("Failed to create log directory {}: {}", &dir, e);
None
} else {
Some(PathBuf::from(dir))
}
}
};
DiagnosticLoggers {
extractor: extractor.to_owned(),
root,
}
}
pub fn logger(&self) -> LogWriter {
thread_local! {
static THREAD_NUM: usize = {
static COUNT: std::sync::atomic::AtomicUsize = std::sync::atomic::AtomicUsize::new(0);
COUNT.fetch_add(1, std::sync::atomic::Ordering::SeqCst)
};
}
THREAD_NUM.with(|n| LogWriter {
extractor: self.extractor.to_owned(),
inner: None,
path: self
.root
.as_ref()
.map(|root| root.to_owned().join(format!("extractor_{}.jsonl", n))),
})
}
}
impl DiagnosticMessage {
pub fn full_error_message(&self) -> String {
match &self.location {
Some(Location {
file: Some(path),
start_line: Some(line),
..
}) => format!("{}:{}: {}", path, line, self.plaintext_message),
_ => self.plaintext_message.to_owned(),
}
}
pub fn text(&mut self, text: &str) -> &mut Self {
self.plaintext_message = text.to_owned();
self
}
#[allow(unused)]
pub fn markdown(&mut self, text: &str) -> &mut Self {
self.markdown_message = text.to_owned();
self
}
pub fn severity(&mut self, severity: Severity) -> &mut Self {
self.severity = Some(severity);
self
}
#[allow(unused)]
pub fn help_link(&mut self, link: &str) -> &mut Self {
self.help_links.push(link.to_owned());
self
}
#[allow(unused)]
pub fn internal(&mut self) -> &mut Self {
self.internal = true;
self
}
#[allow(unused)]
pub fn cli_summary_table(&mut self) -> &mut Self {
self.visibility.cli_summary_table = true;
self
}
pub fn status_page(&mut self) -> &mut Self {
self.visibility.status_page = true;
self
}
#[allow(unused)]
pub fn telemetry(&mut self) -> &mut Self {
self.visibility.telemetry = true;
self
}
pub fn location(
&mut self,
path: &str,
start_line: usize,
start_column: usize,
end_line: usize,
end_column: usize,
) -> &mut Self {
let loc = self.location.get_or_insert(Default::default());
loc.file = Some(path.to_owned());
loc.start_line = Some(start_line);
loc.start_column = Some(start_column);
loc.end_line = Some(end_line);
loc.end_column = Some(end_column);
self
}
}

View File

@@ -1,3 +1,4 @@
use crate::diagnostics;
use crate::trap;
use node_types::{EntryKind, Field, NodeTypeMap, Storage, TypeName};
use std::collections::BTreeMap as Map;
@@ -5,7 +6,6 @@ use std::collections::BTreeSet as Set;
use std::fmt;
use std::path::Path;
use tracing::{error, info, span, Level};
use tree_sitter::{Language, Node, Parser, Range, Tree};
pub fn populate_file(writer: &mut trap::Writer, absolute_path: &Path) -> trap::Label {
@@ -114,21 +114,22 @@ pub fn extract(
language: Language,
language_prefix: &str,
schema: &NodeTypeMap,
diagnostics_writer: &mut diagnostics::LogWriter,
trap_writer: &mut trap::Writer,
path: &Path,
source: &[u8],
ranges: &[Range],
) -> std::io::Result<()> {
let path_str = format!("{}", path.display());
let span = span!(
Level::TRACE,
let span = tracing::span!(
tracing::Level::TRACE,
"extract",
file = %path_str
);
let _enter = span.enter();
info!("extracting: {}", path_str);
tracing::info!("extracting: {}", path_str);
let mut parser = Parser::new();
parser.set_language(language).unwrap();
@@ -138,6 +139,7 @@ pub fn extract(
let file_label = populate_file(trap_writer, path);
let mut visitor = Visitor::new(
source,
diagnostics_writer,
trap_writer,
// TODO: should we handle path strings that are not valid UTF8 better?
&path_str,
@@ -204,6 +206,8 @@ struct Visitor<'a> {
file_label: trap::Label,
/// The source code as a UTF-8 byte array
source: &'a [u8],
/// A diagnostics::LogWriter to write diagnostic messages
diagnostics_writer: &'a mut diagnostics::LogWriter,
/// A trap::Writer to accumulate trap entries
trap_writer: &'a mut trap::Writer,
/// A counter for top-level child nodes
@@ -226,6 +230,7 @@ struct Visitor<'a> {
impl<'a> Visitor<'a> {
fn new(
source: &'a [u8],
diagnostics_writer: &'a mut diagnostics::LogWriter,
trap_writer: &'a mut trap::Writer,
path: &'a str,
file_label: trap::Label,
@@ -236,6 +241,7 @@ impl<'a> Visitor<'a> {
path,
file_label,
source,
diagnostics_writer,
trap_writer,
toplevel_child_counter: 0,
ast_node_info_table_name: format!("{}_ast_node_info", language_prefix),
@@ -245,34 +251,31 @@ impl<'a> Visitor<'a> {
}
}
fn record_parse_error(
&mut self,
error_message: String,
full_error_message: String,
loc: trap::Label,
) {
error!("{}", full_error_message);
fn record_parse_error(&mut self, loc: trap::Label, mesg: &diagnostics::DiagnosticMessage) {
self.diagnostics_writer.write(mesg);
let id = self.trap_writer.fresh_id();
let full_error_message = mesg.full_error_message();
let severity_code = match mesg.severity {
Some(diagnostics::Severity::Error) => 40,
Some(diagnostics::Severity::Warning) => 30,
Some(diagnostics::Severity::Note) => 20,
None => 10,
};
self.trap_writer.add_tuple(
"diagnostics",
vec![
trap::Arg::Label(id),
trap::Arg::Int(40), // severity 40 = error
trap::Arg::Int(severity_code),
trap::Arg::String("parse_error".to_string()),
trap::Arg::String(error_message),
trap::Arg::String(mesg.plaintext_message.to_owned()),
trap::Arg::String(full_error_message),
trap::Arg::Label(loc),
],
);
}
fn record_parse_error_for_node(
&mut self,
error_message: String,
full_error_message: String,
node: Node,
) {
let (start_line, start_column, end_line, end_column) = location_for(self.source, node);
fn record_parse_error_for_node(&mut self, error_message: String, node: Node) {
let (start_line, start_column, end_line, end_column) = location_for(self, node);
let loc = location(
self.trap_writer,
self.file_label,
@@ -281,7 +284,14 @@ impl<'a> Visitor<'a> {
end_line,
end_column,
);
self.record_parse_error(error_message, full_error_message, loc);
self.record_parse_error(
loc,
self.diagnostics_writer
.message("parse-error", "Parse error")
.severity(diagnostics::Severity::Error)
.location(self.path, start_line, start_column, end_line, end_column)
.text(&error_message),
);
}
fn enter_node(&mut self, node: Node) -> bool {
@@ -291,13 +301,7 @@ impl<'a> Visitor<'a> {
} else {
"parse error".to_string()
};
let full_error_message = format!(
"{}:{}: {}",
&self.path,
node.start_position().row + 1,
error_message
);
self.record_parse_error_for_node(error_message, full_error_message, node);
self.record_parse_error_for_node(error_message, node);
return false;
}
@@ -312,7 +316,7 @@ impl<'a> Visitor<'a> {
return;
}
let (id, _, child_nodes) = self.stack.pop().expect("Vistor: empty stack");
let (start_line, start_column, end_line, end_column) = location_for(self.source, node);
let (start_line, start_column, end_line, end_column) = location_for(self, node);
let loc = location(
self.trap_writer,
self.file_label,
@@ -380,13 +384,15 @@ impl<'a> Visitor<'a> {
}
_ => {
let error_message = format!("unknown table type: '{}'", node.kind());
let full_error_message = format!(
"{}:{}: {}",
&self.path,
node.start_position().row + 1,
error_message
self.record_parse_error(
loc,
self.diagnostics_writer
.message("parse-error", "Parse error")
.severity(diagnostics::Severity::Error)
.location(self.path, start_line, start_column, end_line, end_column)
.text(&error_message)
.status_page(),
);
self.record_parse_error(error_message, full_error_message, loc);
valid = false;
}
@@ -440,13 +446,7 @@ impl<'a> Visitor<'a> {
child_node.type_name,
field.type_info
);
let full_error_message = format!(
"{}:{}: {}",
&self.path,
node.start_position().row + 1,
error_message
);
self.record_parse_error_for_node(error_message, full_error_message, *node);
self.record_parse_error_for_node(error_message, *node);
}
} else if child_node.field_name.is_some() || child_node.type_name.named {
let error_message = format!(
@@ -455,13 +455,7 @@ impl<'a> Visitor<'a> {
&child_node.field_name.unwrap_or("child"),
&child_node.type_name
);
let full_error_message = format!(
"{}:{}: {}",
&self.path,
node.start_position().row + 1,
error_message
);
self.record_parse_error_for_node(error_message, full_error_message, *node);
self.record_parse_error_for_node(error_message, *node);
}
}
let mut args = Vec::new();
@@ -484,13 +478,7 @@ impl<'a> Visitor<'a> {
node.kind(),
column_name
);
let full_error_message = format!(
"{}:{}: {}",
&self.path,
node.start_position().row + 1,
error_message
);
self.record_parse_error_for_node(error_message, full_error_message, *node);
self.record_parse_error_for_node(error_message, *node);
}
}
Storage::Table {
@@ -505,17 +493,8 @@ impl<'a> Visitor<'a> {
node.kind(),
table_name,
);
let full_error_message = format!(
"{}:{}: {}",
&self.path,
node.start_position().row + 1,
error_message
);
self.record_parse_error_for_node(
error_message,
full_error_message,
*node,
);
self.record_parse_error_for_node(error_message, *node);
break;
}
let mut args = vec![trap::Arg::Label(parent_id)];
@@ -582,7 +561,7 @@ fn sliced_source_arg(source: &[u8], n: Node) -> trap::Arg {
// Emit a pair of `TrapEntry`s for the provided node, appropriately calibrated.
// The first is the location and label definition, and the second is the
// 'Located' entry.
fn location_for(source: &[u8], n: Node) -> (usize, usize, usize, usize) {
fn location_for(visitor: &mut Visitor, n: Node) -> (usize, usize, usize, usize) {
// Tree-sitter row, column values are 0-based while CodeQL starts
// counting at 1. In addition Tree-sitter's row and column for the
// end position are exclusive while CodeQL's end positions are inclusive.
@@ -600,6 +579,7 @@ fn location_for(source: &[u8], n: Node) -> (usize, usize, usize, usize) {
end_line = start_line;
end_col = start_col - 1;
} else if end_col == 0 {
let source = visitor.source;
// end_col = 0 means that we are at the start of a line
// unfortunately 0 is invalid as column number, therefore
// we should update the end location to be the end of the
@@ -608,7 +588,14 @@ fn location_for(source: &[u8], n: Node) -> (usize, usize, usize, usize) {
if index > 0 && index <= source.len() {
index -= 1;
if source[index] != b'\n' {
error!("expecting a line break symbol, but none found while correcting end column value");
visitor.diagnostics_writer.write(
visitor
.diagnostics_writer
.message("internal-error", "Internal error")
.text("expecting a line break symbol, but none found while correcting end column value")
.status_page()
.severity(diagnostics::Severity::Error),
);
}
end_line -= 1;
end_col = 1;
@@ -617,10 +604,17 @@ fn location_for(source: &[u8], n: Node) -> (usize, usize, usize, usize) {
end_col += 1;
}
} else {
error!(
"cannot correct end column value: end_byte index {} is not in range [1,{}]",
index,
source.len()
visitor.diagnostics_writer.write(
visitor
.diagnostics_writer
.message("internal-error", "Internal error")
.text(&format!(
"cannot correct end column value: end_byte index {} is not in range [1,{}]",
index,
source.len()
))
.status_page()
.severity(diagnostics::Severity::Error),
);
}
}

View File

@@ -1,3 +1,4 @@
mod diagnostics;
mod extractor;
mod trap;
@@ -24,22 +25,19 @@ use tree_sitter::{Language, Parser, Range};
* of cores available on the machine to determine how many threads to use
* (minimum of 1). If unspecified, should be considered as set to -1."
*/
fn num_codeql_threads() -> usize {
fn num_codeql_threads() -> Result<usize, String> {
let threads_str = std::env::var("CODEQL_THREADS").unwrap_or_else(|_| "-1".to_owned());
match threads_str.parse::<i32>() {
Ok(num) if num <= 0 => {
let reduction = -num as usize;
std::cmp::max(1, num_cpus::get() - reduction)
Ok(std::cmp::max(1, num_cpus::get() - reduction))
}
Ok(num) => num as usize,
Ok(num) => Ok(num as usize),
Err(_) => {
tracing::error!(
"Unable to parse CODEQL_THREADS value '{}'; defaulting to 1 thread.",
&threads_str
);
1
}
Err(_) => Err(format!(
"Unable to parse CODEQL_THREADS value '{}'",
&threads_str
)),
}
}
@@ -68,7 +66,21 @@ fn main() -> std::io::Result<()> {
.unwrap_or_else(|_| tracing_subscriber::EnvFilter::new("ruby_extractor=warn")),
)
.init();
let num_threads = num_codeql_threads();
let diagnostics = diagnostics::DiagnosticLoggers::new("ruby");
let mut main_thread_logger = diagnostics.logger();
let num_threads = match num_codeql_threads() {
Ok(num) => num,
Err(e) => {
main_thread_logger.write(
main_thread_logger
.message("configuration-error", "Configuration error")
.text(&format!("{}; defaulting to 1 thread.", e))
.status_page()
.severity(diagnostics::Severity::Warning),
);
1
}
};
tracing::info!(
"Using {} {}",
num_threads,
@@ -78,6 +90,20 @@ fn main() -> std::io::Result<()> {
"threads"
}
);
let trap_compression = match trap::Compression::from_env("CODEQL_RUBY_TRAP_COMPRESSION") {
Ok(x) => x,
Err(e) => {
main_thread_logger.write(
main_thread_logger
.message("configuration-error", "Configuration error")
.text(&format!("{}; using gzip.", e))
.status_page()
.severity(diagnostics::Severity::Warning),
);
trap::Compression::Gzip
}
};
drop(main_thread_logger);
rayon::ThreadPoolBuilder::new()
.num_threads(num_threads)
.build_global()
@@ -100,7 +126,6 @@ fn main() -> std::io::Result<()> {
.value_of("output-dir")
.expect("missing --output-dir");
let trap_dir = PathBuf::from(trap_dir);
let trap_compression = trap::Compression::from_env("CODEQL_RUBY_TRAP_COMPRESSION");
let file_list = matches.value_of("file-list").expect("missing --file-list");
let file_list = fs::File::open(file_list)?;
@@ -119,6 +144,7 @@ fn main() -> std::io::Result<()> {
lines
.par_iter()
.try_for_each(|line| {
let mut diagnostics_writer = diagnostics.logger();
let path = PathBuf::from(line).canonicalize()?;
let src_archive_file = path_for(&src_archive_dir, &path, "");
let mut source = std::fs::read(&path)?;
@@ -131,6 +157,7 @@ fn main() -> std::io::Result<()> {
erb,
"erb",
&erb_schema,
&mut diagnostics_writer,
&mut trap_writer,
&path,
&source,
@@ -170,20 +197,35 @@ fn main() -> std::io::Result<()> {
Ok(str) => source = str.as_bytes().to_owned(),
Err(msg) => {
needs_conversion = false;
tracing::warn!(
"{}: character decoding failure: {} ({})",
&path.to_string_lossy(),
msg,
&encoding_name
diagnostics_writer.write(
diagnostics_writer
.message(
"character-encoding-error",
"Character encoding error",
)
.text(&format!(
"{}: character decoding failure: {} ({})",
&path.to_string_lossy(),
msg,
&encoding_name
))
.status_page()
.severity(diagnostics::Severity::Warning),
);
}
}
}
} else {
tracing::warn!(
"{}: unknown character encoding: '{}'",
&path.to_string_lossy(),
&encoding_name
diagnostics_writer.write(
diagnostics_writer
.message("character-encoding-error", "Character encoding error")
.text(&format!(
"{}: unknown character encoding: '{}'",
&path.to_string_lossy(),
&encoding_name
))
.status_page()
.severity(diagnostics::Severity::Warning),
);
}
}
@@ -194,6 +236,7 @@ fn main() -> std::io::Result<()> {
language,
"ruby",
&schema,
&mut diagnostics_writer,
&mut trap_writer,
&path,
&source,

View File

@@ -224,17 +224,14 @@ pub enum Compression {
}
impl Compression {
pub fn from_env(var_name: &str) -> Compression {
pub fn from_env(var_name: &str) -> Result<Compression, String> {
match std::env::var(var_name) {
Ok(method) => match Compression::from_string(&method) {
Some(c) => c,
None => {
tracing::error!("Unknown compression method '{}'; using gzip.", &method);
Compression::Gzip
}
Some(c) => Ok(c),
None => Err(format!("Unknown compression method '{}'", &method)),
},
// Default compression method if the env var isn't set:
Err(_) => Compression::Gzip,
Err(_) => Ok(Compression::Gzip),
}
}